Repository: qodo-ai/pr-agent Branch: main Commit: aaf8fbe21836 Files: 216 Total size: 1.5 MB Directory structure: gitextract_ey74c0jr/ ├── .dockerignore ├── .github/ │ ├── ISSUE_TEMPLATE/ │ │ ├── bug-report.yml │ │ ├── config.yml │ │ ├── feature-request.yml │ │ └── miscellaneous.yml │ └── workflows/ │ ├── build-and-test.yaml │ ├── code_coverage.yaml │ ├── docs-ci.yaml │ ├── e2e_tests.yaml │ ├── pr-agent-review.yaml │ └── pre-commit.yml ├── .gitignore ├── .pr_agent.toml ├── .pre-commit-config.yaml ├── AGENTS.md ├── CHANGELOG.md ├── CODE_OF_CONDUCT.md ├── CONTRIBUTING.md ├── Dockerfile.github_action ├── Dockerfile.github_action_dockerhub ├── LICENSE ├── MANIFEST.in ├── README.md ├── RELEASE_NOTES.md ├── SECURITY.md ├── action.yaml ├── codecov.yml ├── docker/ │ ├── Dockerfile │ └── Dockerfile.lambda ├── docs/ │ ├── README.md │ ├── docs/ │ │ ├── .gitbook.yaml │ │ ├── CNAME │ │ ├── core-abilities/ │ │ │ ├── compression_strategy.md │ │ │ ├── dynamic_context.md │ │ │ ├── fetching_ticket_context.md │ │ │ ├── index.md │ │ │ ├── interactivity.md │ │ │ ├── metadata.md │ │ │ └── self_reflection.md │ │ ├── css/ │ │ │ └── custom.css │ │ ├── faq/ │ │ │ └── index.md │ │ ├── index.md │ │ ├── installation/ │ │ │ ├── azure.md │ │ │ ├── bitbucket.md │ │ │ ├── gitea.md │ │ │ ├── github.md │ │ │ ├── gitlab.md │ │ │ ├── index.md │ │ │ ├── locally.md │ │ │ └── pr_agent.md │ │ ├── overview/ │ │ │ └── data_privacy.md │ │ ├── summary.md │ │ ├── tools/ │ │ │ ├── add_docs.md │ │ │ ├── ask.md │ │ │ ├── describe.md │ │ │ ├── generate_labels.md │ │ │ ├── help.md │ │ │ ├── help_docs.md │ │ │ ├── improve.md │ │ │ ├── index.md │ │ │ ├── review.md │ │ │ ├── similar_issues.md │ │ │ └── update_changelog.md │ │ └── usage-guide/ │ │ ├── EXAMPLE_BEST_PRACTICE.md │ │ ├── additional_configurations.md │ │ ├── automations_and_usage.md │ │ ├── changing_a_model.md │ │ ├── configuration_options.md │ │ ├── index.md │ │ ├── introduction.md │ │ └── mail_notifications.md │ ├── mkdocs.yml │ └── overrides/ │ ├── 
main.html │ └── partials/ │ ├── footer.html │ └── integrations/ │ └── analytics/ │ └── custom.html ├── github_action/ │ └── entrypoint.sh ├── pr_agent/ │ ├── __init__.py │ ├── agent/ │ │ ├── __init__.py │ │ └── pr_agent.py │ ├── algo/ │ │ ├── __init__.py │ │ ├── ai_handlers/ │ │ │ ├── base_ai_handler.py │ │ │ ├── langchain_ai_handler.py │ │ │ ├── litellm_ai_handler.py │ │ │ ├── litellm_helpers.py │ │ │ └── openai_ai_handler.py │ │ ├── cli_args.py │ │ ├── file_filter.py │ │ ├── git_patch_processing.py │ │ ├── language_handler.py │ │ ├── pr_processing.py │ │ ├── token_handler.py │ │ ├── types.py │ │ └── utils.py │ ├── cli.py │ ├── cli_pip.py │ ├── config_loader.py │ ├── custom_merge_loader.py │ ├── git_providers/ │ │ ├── __init__.py │ │ ├── azuredevops_provider.py │ │ ├── bitbucket_provider.py │ │ ├── bitbucket_server_provider.py │ │ ├── codecommit_client.py │ │ ├── codecommit_provider.py │ │ ├── gerrit_provider.py │ │ ├── git_provider.py │ │ ├── gitea_provider.py │ │ ├── github_provider.py │ │ ├── gitlab_provider.py │ │ ├── local_git_provider.py │ │ └── utils.py │ ├── identity_providers/ │ │ ├── __init__.py │ │ ├── default_identity_provider.py │ │ └── identity_provider.py │ ├── log/ │ │ └── __init__.py │ ├── secret_providers/ │ │ ├── __init__.py │ │ ├── aws_secrets_manager_provider.py │ │ ├── google_cloud_storage_secret_provider.py │ │ └── secret_provider.py │ ├── servers/ │ │ ├── __init__.py │ │ ├── atlassian-connect-qodo-merge.json │ │ ├── atlassian-connect.json │ │ ├── azuredevops_server_webhook.py │ │ ├── bitbucket_app.py │ │ ├── bitbucket_server_webhook.py │ │ ├── gerrit_server.py │ │ ├── gitea_app.py │ │ ├── github_action_runner.py │ │ ├── github_app.py │ │ ├── github_lambda_webhook.py │ │ ├── github_polling.py │ │ ├── gitlab_lambda_webhook.py │ │ ├── gitlab_webhook.py │ │ ├── gunicorn_config.py │ │ ├── help.py │ │ └── utils.py │ ├── settings/ │ │ ├── .secrets_template.toml │ │ ├── code_suggestions/ │ │ │ ├── pr_code_suggestions_prompts.toml │ │ │ ├── 
pr_code_suggestions_prompts_not_decoupled.toml │ │ │ └── pr_code_suggestions_reflect_prompts.toml │ │ ├── configuration.toml │ │ ├── custom_labels.toml │ │ ├── generated_code_ignore.toml │ │ ├── ignore.toml │ │ ├── language_extensions.toml │ │ ├── pr_add_docs.toml │ │ ├── pr_custom_labels.toml │ │ ├── pr_description_prompts.toml │ │ ├── pr_evaluate_prompt_response.toml │ │ ├── pr_help_docs_headings_prompts.toml │ │ ├── pr_help_docs_prompts.toml │ │ ├── pr_help_prompts.toml │ │ ├── pr_information_from_user_prompts.toml │ │ ├── pr_line_questions_prompts.toml │ │ ├── pr_questions_prompts.toml │ │ ├── pr_reviewer_prompts.toml │ │ └── pr_update_changelog_prompts.toml │ └── tools/ │ ├── __init__.py │ ├── pr_add_docs.py │ ├── pr_code_suggestions.py │ ├── pr_config.py │ ├── pr_description.py │ ├── pr_generate_labels.py │ ├── pr_help_docs.py │ ├── pr_help_message.py │ ├── pr_line_questions.py │ ├── pr_questions.py │ ├── pr_reviewer.py │ ├── pr_similar_issue.py │ ├── pr_update_changelog.py │ └── ticket_pr_compliance_check.py ├── pr_compliance_checklist.yaml ├── pyproject.toml ├── requirements-dev.txt ├── requirements.txt ├── setup.py └── tests/ ├── e2e_tests/ │ ├── e2e_utils.py │ ├── langchain_ai_handler.py │ ├── test_bitbucket_app.py │ ├── test_gitea_app.py │ ├── test_github_app.py │ └── test_gitlab_webhook.py ├── health_test/ │ └── main.py └── unittest/ ├── test_add_docs_trigger.py ├── test_aws_secrets_manager_provider.py ├── test_azure_devops_comment.py ├── test_azure_devops_parsing.py ├── test_bitbucket_provider.py ├── test_clip_tokens.py ├── test_codecommit_client.py ├── test_codecommit_provider.py ├── test_config_loader_secrets.py ├── test_convert_to_markdown.py ├── test_delete_hunks.py ├── test_extend_patch.py ├── test_extract_issue_from_branch.py ├── test_fetching_sub_issues.py ├── test_file_filter.py ├── test_find_line_number_of_relevant_line_in_file.py ├── test_fix_json_escape_char.py ├── test_fix_output.py ├── test_fresh_vars_functionality.py ├── 
test_get_max_tokens.py ├── test_gitea_provider.py ├── test_github_action_output.py ├── test_gitlab_provider.py ├── test_gitlab_webhook_port.py ├── test_handle_patch_deletions.py ├── test_ignore_repositories.py ├── test_language_handler.py ├── test_litellm_reasoning_effort.py ├── test_load_yaml.py ├── test_parse_code_suggestion.py ├── test_pr_update_changelog.py ├── test_secret_provider_factory.py ├── test_similar_issue_non_github.py └── test_try_fix_yaml.py ================================================ FILE CONTENTS ================================================ ================================================ FILE: .dockerignore ================================================ .venv/ venv/ pr_agent/settings/.secrets.toml pics/ pr_agent.egg-info/ build/ ================================================ FILE: .github/ISSUE_TEMPLATE/bug-report.yml ================================================ name: "\U0001FAB2 Bug Report" description: Submit a bug report labels: ["bug"] body: - type: dropdown id: information-git-provider attributes: label: Git provider description: 'The problem arises when using:' options: - "Github Cloud" - "Github Enterprise" - "Gitlab" - "Bitbucket Cloud" - "Bitbucket Server" - "Azure" - "Other" validations: required: true - type: textarea id: system-info attributes: label: System Info description: Please share your system info with us. placeholder: model used, deployment type (action/app/cli/...), etc... validations: required: true - type: textarea id: bug-details attributes: label: Bug details description: Please describe the problem. 
placeholder: Describe the problem validations: required: true ================================================ FILE: .github/ISSUE_TEMPLATE/config.yml ================================================ blank_issues_enabled: false version: 0.1 contact_links: - name: Discussions url: https://github.com/qodo-ai/pr-agent/discussions about: GitHub Discussions - name: Discord community url: https://discord.com/channels/1057273017547378788/1126104260430528613 about: Join our discord community ================================================ FILE: .github/ISSUE_TEMPLATE/feature-request.yml ================================================ name: "\U0001F4A1 Feature request" description: Submit a proposal/request for a new PR-Agent feature labels: ["feature"] body: - type: textarea id: feature-request validations: required: true attributes: label: Feature request description: | Description of the feature proposal. - type: textarea id: motivation validations: required: true attributes: label: Motivation description: | Outline the motivation for the proposal. ================================================ FILE: .github/ISSUE_TEMPLATE/miscellaneous.yml ================================================ name: "❔ General Issue" description: Submit a general issue labels: ["general"] body: - type: dropdown id: information-git-provider attributes: label: Git provider (optional) description: 'Git Provider:' options: - "Github Cloud" - "Github Enterprise" - "Gitlab" - "Bitbucket Cloud" - "Bitbucket Server" - "Azure" - "Other" - type: textarea id: system-info attributes: label: System Info (optional) description: Please share your system info with us. placeholder: model used, deployment type (action/app/cli/...), etc... validations: required: false - type: textarea id: issues-details attributes: label: Issues details description: Please share the issues details. 
placeholder: Describe the issue validations: required: true ================================================ FILE: .github/workflows/build-and-test.yaml ================================================ name: Build-and-test on: push: branches: - main pull_request: branches: - main jobs: build-and-test: runs-on: ubuntu-latest steps: - id: checkout uses: actions/checkout@v6 - id: dockerx name: Setup Docker Buildx uses: docker/setup-buildx-action@v3 - id: build name: Build dev docker uses: docker/build-push-action@v6 with: context: . file: ./docker/Dockerfile push: false load: true tags: codiumai/pr-agent:test cache-from: type=gha,scope=dev cache-to: type=gha,mode=max,scope=dev target: test - id: test name: Test dev docker run: | docker run --rm codiumai/pr-agent:test pytest -v tests/unittest ================================================ FILE: .github/workflows/code_coverage.yaml ================================================ name: Code-coverage on: workflow_dispatch: # push: # branches: # - main pull_request: branches: - main jobs: build-and-test: runs-on: ubuntu-latest steps: - id: checkout uses: actions/checkout@v6 - id: dockerx name: Setup Docker Buildx uses: docker/setup-buildx-action@v3 - id: build name: Build dev docker uses: docker/build-push-action@v6 with: context: . file: ./docker/Dockerfile push: false load: true tags: codiumai/pr-agent:test cache-from: type=gha,scope=dev cache-to: type=gha,mode=max,scope=dev target: test - id: code_cov name: Test dev docker run: | docker run --name test_container codiumai/pr-agent:test pytest tests/unittest --cov=pr_agent --cov-report term --cov-report xml:coverage.xml docker cp test_container:/app/coverage.xml coverage.xml docker rm test_container - name: Validate coverage report run: | if [ ! 
-f coverage.xml ]; then echo "Coverage report not found" exit 1 fi - name: Upload coverage to Codecov uses: codecov/codecov-action@v5 with: token: ${{ secrets.CODECOV_TOKEN }} ================================================ FILE: .github/workflows/docs-ci.yaml ================================================ name: docs-ci on: push: branches: - main - add-docs-portal paths: - docs/** permissions: contents: write jobs: deploy: runs-on: ubuntu-latest steps: - uses: actions/checkout@v6 - name: Configure Git Credentials run: | git config user.name github-actions[bot] git config user.email 41898282+github-actions[bot]@users.noreply.github.com - uses: actions/setup-python@v5 with: python-version: 3.x - run: echo "cache_id=$(date --utc '+%V')" >> $GITHUB_ENV - uses: actions/cache@v4 with: key: mkdocs-material-${{ env.cache_id }} path: .cache restore-keys: | mkdocs-material- - run: pip install mkdocs-material - run: pip install "mkdocs-material[imaging]" - run: pip install mkdocs-glightbox - run: mkdocs gh-deploy -f docs/mkdocs.yml --force ================================================ FILE: .github/workflows/e2e_tests.yaml ================================================ name: PR-Agent E2E tests on: workflow_dispatch: # schedule: # - cron: '0 0 * * *' # This cron expression runs the workflow every night at midnight UTC jobs: pr_agent_job: runs-on: ubuntu-latest name: PR-Agent E2E GitHub App Test steps: - name: Checkout repository uses: actions/checkout@v6 - name: Setup Docker Buildx uses: docker/setup-buildx-action@v3 - id: build name: Build dev docker uses: docker/build-push-action@v6 with: context: . 
file: ./docker/Dockerfile push: false load: true tags: codiumai/pr-agent:test cache-from: type=gha,scope=dev cache-to: type=gha,mode=max,scope=dev target: test - id: test1 name: E2E test github app run: | docker run -e GITHUB.USER_TOKEN=${{ secrets.TOKEN_GITHUB }} --rm codiumai/pr-agent:test pytest -v tests/e2e_tests/test_github_app.py - id: test2 name: E2E gitlab webhook run: | docker run -e gitlab.PERSONAL_ACCESS_TOKEN=${{ secrets.TOKEN_GITLAB }} --rm codiumai/pr-agent:test pytest -v tests/e2e_tests/test_gitlab_webhook.py - id: test3 name: E2E bitbucket app run: | docker run -e BITBUCKET.USERNAME=${{ secrets.BITBUCKET_USERNAME }} -e BITBUCKET.PASSWORD=${{ secrets.BITBUCKET_PASSWORD }} --rm codiumai/pr-agent:test pytest -v tests/e2e_tests/test_bitbucket_app.py ================================================ FILE: .github/workflows/pr-agent-review.yaml ================================================ # This workflow enables developers to call PR-Agents `/[actions]` in PR's comments and upon PR creation. # Learn more at https://www.codium.ai/pr-agent/ # This is v0.2 of this workflow file name: PR-Agent on: # pull_request: # issue_comment: workflow_dispatch: permissions: issues: write pull-requests: write jobs: pr_agent_job: runs-on: ubuntu-latest name: Run pr agent on every pull request steps: - name: PR Agent action step id: pragent uses: Codium-ai/pr-agent@main env: OPENAI_KEY: ${{ secrets.OPENAI_KEY }} OPENAI_ORG: ${{ secrets.OPENAI_ORG }} # optional GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} PINECONE.API_KEY: ${{ secrets.PINECONE_API_KEY }} PINECONE.ENVIRONMENT: ${{ secrets.PINECONE_ENVIRONMENT }} GITHUB_ACTION_CONFIG.AUTO_DESCRIBE: true GITHUB_ACTION_CONFIG.AUTO_REVIEW: true GITHUB_ACTION_CONFIG.AUTO_IMPROVE: true ================================================ FILE: .github/workflows/pre-commit.yml ================================================ # disabled. We might run it manually if needed. 
name: pre-commit on: workflow_dispatch: # pull_request: # push: # branches: [main] jobs: pre-commit: runs-on: ubuntu-latest steps: - uses: actions/checkout@v6 - uses: actions/setup-python@v5 # SEE https://github.com/pre-commit/action - uses: pre-commit/action@v3.0.1 ================================================ FILE: .gitignore ================================================ .idea/ .lsp/ .vscode/ .env .venv/ venv/ pr_agent/settings/.secrets.toml __pycache__ dist/ *.egg-info/ build/ .DS_Store docs/.cache/ .qodo poetry.lock ================================================ FILE: .pr_agent.toml ================================================ [pr_reviewer] enable_review_labels_effort = true enable_auto_approval = true [github_app] pr_commands = [ "/describe --pr_description.publish_description_as_comment=true", "/improve", "/agentic_review" ] handle_push_trigger = true push_commands = [ "/improve", "/agentic_review" ] [review_agent] enabled = true publish_output = true ================================================ FILE: .pre-commit-config.yaml ================================================ # See https://pre-commit.com for more information # See https://pre-commit.com/hooks.html for more hooks default_language_version: python: python3 repos: - repo: https://github.com/pre-commit/pre-commit-hooks rev: v5.0.0 hooks: - id: check-added-large-files - id: check-toml - id: check-yaml - id: end-of-file-fixer - id: trailing-whitespace # - repo: https://github.com/rhysd/actionlint # rev: v1.7.3 # hooks: # - id: actionlint - repo: https://github.com/pycqa/isort # rev must match what's in requirements-dev.txt rev: 5.13.2 hooks: - id: isort # - repo: https://github.com/PyCQA/bandit # rev: 1.7.10 # hooks: # - id: bandit # args: [ # "-c", "pyproject.toml", # ] # - repo: https://github.com/astral-sh/ruff-pre-commit # rev: v0.7.1 # hooks: # - id: ruff # args: # - --fix # - id: ruff-format # - repo: https://github.com/PyCQA/autoflake # rev: v2.3.1 # hooks: # - id: autoflake # 
args: # - --in-place # - --remove-all-unused-imports # - --remove-unused-variables ================================================ FILE: AGENTS.md ================================================ # Repository Guidelines ## Dos and Don’ts - **Do** match the interpreter requirement declared in `pyproject.toml` (Python ≥ 3.12) and install `requirements.txt` plus `requirements-dev.txt` before running tools. - **Do** run tests with `PYTHONPATH=.` set to keep imports functional (for example `PYTHONPATH=. ./.venv/bin/pytest tests/unittest/test_fix_json_escape_char.py -q`). - **Do** adjust configuration through `.pr_agent.toml` or files under `pr_agent/settings/` instead of hard-coding values. - **Don’t** commit secrets or access tokens; rely on environment variables as shown in the health and e2e tests. - **Don’t** reformat or reorder files globally; match existing 120-character lines, import ordering, and docstring style. - **Don’t** delete or rename configuration, prompt, or workflow files without maintainer approval. ## Project Structure and Module Organization PR-Agent automates AI-assisted reviews for pull requests across multiple git providers. - `pr_agent/agent/` orchestrates commands (`review`, `describe`, `improve`, etc.) via `pr_agent/agent/pr_agent.py`. - `pr_agent/tools/` implements individual capabilities such as reviewers, code suggestions, docs updates, and label generation. - `pr_agent/git_providers/` and `pr_agent/identity_providers/` handle integrations with GitHub, GitLab, Bitbucket, Azure DevOps, and secrets. - `pr_agent/settings/` stores Dynaconf defaults (prompts, configuration templates, ignore lists) respected at runtime; `.pr_agent.toml` overrides repository-level behavior. - `tests/unittest/`, `tests/e2e_tests/`, and `tests/health_test/` contain pytest-based unit, end-to-end, and smoke checks. - `docs/` holds the MkDocs site (`docs/mkdocs.yml` plus content under `docs/docs/`); overrides live in `docs/overrides/`. 
- `.github/workflows/` defines CI pipelines for unit tests, coverage, docs deployment, pre-commit, and PR-agent self-review. - `docker/` and the root Dockerfiles provide build targets for services (`github_app`, `gitlab_webhook`, etc.) and the `test` stage used in CI. ## Build, Test, and Development Commands - Create or activate a virtual environment, then install runtime dependencies with `pip install -r requirements.txt`; add development tooling via `pip install -r requirements-dev.txt`. - Run a single unit test (verified): `PYTHONPATH=. ./.venv/bin/pytest tests/unittest/test_fix_json_escape_char.py -q`. - Run the full unit suite: `PYTHONPATH=. ./.venv/bin/pytest tests/unittest -v`. - Execute the CLI locally once dependencies and API keys are available: `python -m pr_agent.cli --pr_url <PR_URL> review`. - Build the `test` Docker target that mirrors CI when containerizing: `docker build -f docker/Dockerfile --target test .` (loads dev dependencies and copies `tests/`). - Generate and deploy documentation with MkDocs after installing the same extras as CI (`mkdocs-material`, `mkdocs-glightbox`): `mkdocs serve -f docs/mkdocs.yml` for previews and `mkdocs gh-deploy -f docs/mkdocs.yml` for publication. ## Coding Style and Naming Conventions - Python sources follow the Ruff configuration in `pyproject.toml` (`line-length = 120`, Pyflakes plus `flake8-bugbear` checks, and isort ordering). Keep imports grouped as isort would produce and prefer double quotes for strings. - Pre-commit (`.pre-commit-config.yaml`) enforces trailing whitespace cleanup, final newlines, TOML/YAML validity, and optional `isort`; run `pre-commit run --all-files` before submitting patches if installed. - Match existing docstring and comment style—concise English comments using imperative phrasing only where necessary. - Configuration files in `pr_agent/settings/` are TOML; preserve formatting, section order, and comments when editing prompts or defaults. 
- Markdown in `docs/` uses MkDocs conventions (YAML front matter absent; rely on heading hierarchy already in place). ## Testing Guidelines - Pytest is the standard framework; keep new tests under the closest matching directory (`tests/unittest/` for unit logic, `tests/e2e_tests/` for integration flows, `tests/health_test/` for smoke coverage). - Prefer focused unit tests that isolate helpers in `pr_agent/algo/`, `pr_agent/tools/`, or provider adapters; use parameterized tests where existing files already do so. - Set `PYTHONPATH=.` when invoking pytest from the repository root to avoid import errors. - End-to-end suites require provider tokens (`TOKEN_GITHUB`, `TOKEN_GITLAB`, `BITBUCKET_USERNAME`, `BITBUCKET_PASSWORD`) and may take several minutes; run them only when credentials and sandboxes are configured. - The health test (`tests/health_test/main.py`) exercises `/describe`, `/review`, and `/improve`; update expected artifacts if prompts change meaningfully. ## Commit and Pull Request Guidelines - Follow `CONTRIBUTING.md`: keep changes focused, add or update tests, and use Conventional Commit-style messages (e.g., `fix: handle missing repo settings gracefully`). - Target branch names follow `feature/` or `fix/` patterns for substantial work. - Reference related issues and update README or docs when user-facing behavior shifts. - Ensure CI workflows (`build-and-test`, `code-coverage`, `docs-ci`) succeed locally or in draft PRs before requesting review; reproduce failures with the documented commands above. - Include screenshots or terminal captures when modifying user-visible output or documentation previews. ## Safety and Permissions - Ask for confirmation before adding dependencies, renaming files, or changing workflow definitions; many consumers embed these paths and prompts. - Stay within existing formatting and directory conventions—avoid mass refactors, re-sorting of prompts, or reformatting Markdown beyond the touched sections. 
- You may read files, list directories, and run targeted lint/test/doc commands without prior approval; coordinate before launching full Docker builds or e2e suites that rely on external credentials. - Never commit cached credentials, API keys, or coverage artifacts; CI already handles secrets through GitHub Actions. - Treat prompt and configuration files as single sources of truth—update mirrors (`.pr_agent.toml`, `pr_agent/settings/*.toml`) together when behavior changes. ## Security and Configuration Tips - Secrets should be supplied through environment variables (see usages in `tests/e2e_tests/test_github_app.py` and `tests/health_test/main.py`); do not persist them in code or configuration files. - Adjust runtime behavior by overriding keys in `.pr_agent.toml` or by supplying repository-specific Dynaconf files; keep overrides minimal and documented inside the PR description. - Review `SECURITY.md` before disclosing vulnerabilities and follow its contact instructions for responsible reporting. ================================================ FILE: CHANGELOG.md ================================================ ## 2023-08-03 ### Optimized - Optimized PR diff processing by introducing caching for diff files, reducing the number of API calls. - Refactored `load_large_diff` function to generate a patch only when necessary. - Fixed a bug in the GitLab provider where the new file was not retrieved correctly. ## 2023-08-02 ### Enhanced - Updated several tools in the `pr_agent` package to use commit messages in their functionality. - Commit messages are now retrieved and stored in the `vars` dictionary for each tool. - Added a section to display the commit messages in the prompts of various tools. ## 2023-08-01 ### Enhanced - Introduced the ability to retrieve commit messages from pull requests across different git providers. - Implemented commit messages retrieval for GitHub and GitLab providers. 
- Updated the PR description template to include a section for commit messages if they exist. - Added support for repository-specific configuration files (.pr_agent.yaml) for the PR Agent. - Implemented this feature for both GitHub and GitLab providers. - Added a new configuration option 'use_repo_settings_file' to enable or disable the use of a repo-specific settings file. ## 2023-07-30 ### Enhanced - Added the ability to modify any configuration parameter from 'configuration.toml' on-the-fly. - Updated the command line interface and bot commands to accept configuration changes as arguments. - Improved the PR agent to handle additional arguments for each action. ## 2023-07-28 ### Improved - Enhanced error handling and logging in the GitLab provider. - Improved handling of inline comments and code suggestions in GitLab. - Fixed a bug where an additional unneeded line was added to code suggestions in GitLab. ## 2023-07-26 ### Added - New feature for updating the CHANGELOG.md based on the contents of a PR. - Added support for this feature for the GitHub provider. - New configuration settings and prompts for the changelog update feature. ================================================ FILE: CODE_OF_CONDUCT.md ================================================ # Contributor Code of Conduct As contributors and maintainers of this project, and in the interest of fostering an open and welcoming community, we pledge to respect all people who contribute through reporting issues, posting feature requests, updating documentation, submitting pull requests or patches, and other activities. We are committed to making participation in this project a harassment-free experience for everyone, regardless of level of experience, gender, gender identity and expression, sexual orientation, disability, personal appearance, body size, race, ethnicity, age, religion, or nationality. 
Examples of unacceptable behavior by participants include: * The use of sexualized language or imagery * Personal attacks * Trolling or insulting/derogatory comments * Public or private harassment * Publishing other's private information, such as physical or electronic addresses, without explicit permission * Other unethical or unprofessional conduct Project maintainers have the right and responsibility to remove, edit, or reject comments, commits, code, wiki edits, issues, and other contributions that are not aligned to this Code of Conduct, or to ban temporarily or permanently any contributor for other behaviors that they deem inappropriate, threatening, offensive, or harmful. By adopting this Code of Conduct, project maintainers commit themselves to fairly and consistently applying these principles to every aspect of managing this project. Project maintainers who do not follow or enforce the Code of Conduct may be permanently removed from the project team. This Code of Conduct applies both within project spaces and in public spaces when an individual is representing the project or its community. Instances of abusive, harassing, or otherwise unacceptable behavior may be reported by contacting a project maintainer at dana.f@qodo.ai . All complaints will be reviewed and investigated and will result in a response that is deemed necessary and appropriate to the circumstances. Maintainers are obligated to maintain confidentiality with regard to the reporter of an incident. This Code of Conduct is adapted from the [Contributor Covenant](https://contributor-covenant.org), version 1.3.0, available at [contributor-covenant.org/version/1/3/0/](https://contributor-covenant.org/version/1/3/0/) ================================================ FILE: CONTRIBUTING.md ================================================ # Contributing to PR-Agent Thank you for your interest in contributing to the PR-Agent project! ## Getting Started 1. Fork the repository and clone your fork 2. 
Install Python 3.12 or higher 3. Install dependencies (`requirements.txt` and `requirements-dev.txt`) 4. Create a new branch for your contribution: - For new features: `git checkout -b feature/your-feature-name` - For bug fixes: `git checkout -b fix/issue-description` 5. Make your changes 6. Write or update tests as needed 7. Run tests locally to ensure everything passes 8. Commit your changes using conventional commit messages 9. Push to your fork and submit a pull request ## Development Guidelines - Keep pull requests focused on a single feature or fix - Follow the existing code style and formatting conventions - Add unit tests for any new functionality using pytest - Ensure test coverage for your changes - Update documentation as needed ## Pull Request Process 1. Ensure your PR includes a clear description of the changes 2. Link any related issues 3. Update the README.md if needed 4. Wait for review from maintainers ## Questions or Need Help? - Join our [Discord community](https://discord.com/channels/1057273017547378788/1126104260430528613) for questions and discussions - Check the [documentation](https://qodo-merge-docs.qodo.ai/) for detailed information - Report bugs or request features through [GitHub Issues](https://github.com/qodo-ai/pr-agent/issues) ================================================ FILE: Dockerfile.github_action ================================================ FROM python:3.12.10-slim AS base RUN apt-get update && apt-get install --no-install-recommends -y git curl && apt-get clean && rm -rf /var/lib/apt/lists/* WORKDIR /app ADD pyproject.toml . ADD requirements.txt . RUN pip install --no-cache-dir . 
&& rm pyproject.toml requirements.txt ENV PYTHONPATH=/app ADD docs docs ADD pr_agent pr_agent ADD github_action/entrypoint.sh / RUN chmod +x /entrypoint.sh ENTRYPOINT ["/entrypoint.sh"] ================================================ FILE: Dockerfile.github_action_dockerhub ================================================ FROM codiumai/pr-agent:github_action ================================================ FILE: LICENSE ================================================ GNU AFFERO GENERAL PUBLIC LICENSE Version 3, 19 November 2007 Copyright (C) 2007 Free Software Foundation, Inc. Everyone is permitted to copy and distribute verbatim copies of this license document, but changing it is not allowed. Preamble The GNU Affero General Public License is a free, copyleft license for software and other kinds of works, specifically designed to ensure cooperation with the community in the case of network server software. The licenses for most software and other practical works are designed to take away your freedom to share and change the works. By contrast, our General Public Licenses are intended to guarantee your freedom to share and change all versions of a program--to make sure it remains free software for all its users. When we speak of free software, we are referring to freedom, not price. Our General Public Licenses are designed to make sure that you have the freedom to distribute copies of free software (and charge for them if you wish), that you receive source code or can get it if you want it, that you can change the software or use pieces of it in new free programs, and that you know you can do these things. Developers that use our General Public Licenses protect your rights with two steps: (1) assert copyright on the software, and (2) offer you this License which gives you legal permission to copy, distribute and/or modify the software. 
A secondary benefit of defending all users' freedom is that improvements made in alternate versions of the program, if they receive widespread use, become available for other developers to incorporate. Many developers of free software are heartened and encouraged by the resulting cooperation. However, in the case of software used on network servers, this result may fail to come about. The GNU General Public License permits making a modified version and letting the public access it on a server without ever releasing its source code to the public. The GNU Affero General Public License is designed specifically to ensure that, in such cases, the modified source code becomes available to the community. It requires the operator of a network server to provide the source code of the modified version running there to the users of that server. Therefore, public use of a modified version, on a publicly accessible server, gives the public access to the source code of the modified version. An older license, called the Affero General Public License and published by Affero, was designed to accomplish similar goals. This is a different license, not a version of the Affero GPL, but Affero has released a new version of the Affero GPL which permits relicensing under this license. The precise terms and conditions for copying, distribution and modification follow. TERMS AND CONDITIONS 0. Definitions. "This License" refers to version 3 of the GNU Affero General Public License. "Copyright" also means copyright-like laws that apply to other kinds of works, such as semiconductor masks. "The Program" refers to any copyrightable work licensed under this License. Each licensee is addressed as "you". "Licensees" and "recipients" may be individuals or organizations. To "modify" a work means to copy from or adapt all or part of the work in a fashion requiring copyright permission, other than the making of an exact copy. 
The resulting work is called a "modified version" of the earlier work or a work "based on" the earlier work. A "covered work" means either the unmodified Program or a work based on the Program. To "propagate" a work means to do anything with it that, without permission, would make you directly or secondarily liable for infringement under applicable copyright law, except executing it on a computer or modifying a private copy. Propagation includes copying, distribution (with or without modification), making available to the public, and in some countries other activities as well. To "convey" a work means any kind of propagation that enables other parties to make or receive copies. Mere interaction with a user through a computer network, with no transfer of a copy, is not conveying. An interactive user interface displays "Appropriate Legal Notices" to the extent that it includes a convenient and prominently visible feature that (1) displays an appropriate copyright notice, and (2) tells the user that there is no warranty for the work (except to the extent that warranties are provided), that licensees may convey the work under this License, and how to view a copy of this License. If the interface presents a list of user commands or options, such as a menu, a prominent item in the list meets this criterion. 1. Source Code. The "source code" for a work means the preferred form of the work for making modifications to it. "Object code" means any non-source form of a work. A "Standard Interface" means an interface that either is an official standard defined by a recognized standards body, or, in the case of interfaces specified for a particular programming language, one that is widely used among developers working in that language. 
The "System Libraries" of an executable work include anything, other than the work as a whole, that (a) is included in the normal form of packaging a Major Component, but which is not part of that Major Component, and (b) serves only to enable use of the work with that Major Component, or to implement a Standard Interface for which an implementation is available to the public in source code form. A "Major Component", in this context, means a major essential component (kernel, window system, and so on) of the specific operating system (if any) on which the executable work runs, or a compiler used to produce the work, or an object code interpreter used to run it. The "Corresponding Source" for a work in object code form means all the source code needed to generate, install, and (for an executable work) run the object code and to modify the work, including scripts to control those activities. However, it does not include the work's System Libraries, or general-purpose tools or generally available free programs which are used unmodified in performing those activities but which are not part of the work. For example, Corresponding Source includes interface definition files associated with source files for the work, and the source code for shared libraries and dynamically linked subprograms that the work is specifically designed to require, such as by intimate data communication or control flow between those subprograms and other parts of the work. The Corresponding Source need not include anything that users can regenerate automatically from other parts of the Corresponding Source. The Corresponding Source for a work in source code form is that same work. 2. Basic Permissions. All rights granted under this License are granted for the term of copyright on the Program, and are irrevocable provided the stated conditions are met. This License explicitly affirms your unlimited permission to run the unmodified Program. 
The output from running a covered work is covered by this License only if the output, given its content, constitutes a covered work. This License acknowledges your rights of fair use or other equivalent, as provided by copyright law. You may make, run and propagate covered works that you do not convey, without conditions so long as your license otherwise remains in force. You may convey covered works to others for the sole purpose of having them make modifications exclusively for you, or provide you with facilities for running those works, provided that you comply with the terms of this License in conveying all material for which you do not control copyright. Those thus making or running the covered works for you must do so exclusively on your behalf, under your direction and control, on terms that prohibit them from making any copies of your copyrighted material outside their relationship with you. Conveying under any other circumstances is permitted solely under the conditions stated below. Sublicensing is not allowed; section 10 makes it unnecessary. 3. Protecting Users' Legal Rights From Anti-Circumvention Law. No covered work shall be deemed part of an effective technological measure under any applicable law fulfilling obligations under article 11 of the WIPO copyright treaty adopted on 20 December 1996, or similar laws prohibiting or restricting circumvention of such measures. When you convey a covered work, you waive any legal power to forbid circumvention of technological measures to the extent such circumvention is effected by exercising rights under this License with respect to the covered work, and you disclaim any intention to limit operation or modification of the work as a means of enforcing, against the work's users, your or third parties' legal rights to forbid circumvention of technological measures. 4. Conveying Verbatim Copies. 
You may convey verbatim copies of the Program's source code as you receive it, in any medium, provided that you conspicuously and appropriately publish on each copy an appropriate copyright notice; keep intact all notices stating that this License and any non-permissive terms added in accord with section 7 apply to the code; keep intact all notices of the absence of any warranty; and give all recipients a copy of this License along with the Program. You may charge any price or no price for each copy that you convey, and you may offer support or warranty protection for a fee. 5. Conveying Modified Source Versions. You may convey a work based on the Program, or the modifications to produce it from the Program, in the form of source code under the terms of section 4, provided that you also meet all of these conditions: a) The work must carry prominent notices stating that you modified it, and giving a relevant date. b) The work must carry prominent notices stating that it is released under this License and any conditions added under section 7. This requirement modifies the requirement in section 4 to "keep intact all notices". c) You must license the entire work, as a whole, under this License to anyone who comes into possession of a copy. This License will therefore apply, along with any applicable section 7 additional terms, to the whole of the work, and all its parts, regardless of how they are packaged. This License gives no permission to license the work in any other way, but it does not invalidate such permission if you have separately received it. d) If the work has interactive user interfaces, each must display Appropriate Legal Notices; however, if the Program has interactive interfaces that do not display Appropriate Legal Notices, your work need not make them do so. 
A compilation of a covered work with other separate and independent works, which are not by their nature extensions of the covered work, and which are not combined with it such as to form a larger program, in or on a volume of a storage or distribution medium, is called an "aggregate" if the compilation and its resulting copyright are not used to limit the access or legal rights of the compilation's users beyond what the individual works permit. Inclusion of a covered work in an aggregate does not cause this License to apply to the other parts of the aggregate. 6. Conveying Non-Source Forms. You may convey a covered work in object code form under the terms of sections 4 and 5, provided that you also convey the machine-readable Corresponding Source under the terms of this License, in one of these ways: a) Convey the object code in, or embodied in, a physical product (including a physical distribution medium), accompanied by the Corresponding Source fixed on a durable physical medium customarily used for software interchange. b) Convey the object code in, or embodied in, a physical product (including a physical distribution medium), accompanied by a written offer, valid for at least three years and valid for as long as you offer spare parts or customer support for that product model, to give anyone who possesses the object code either (1) a copy of the Corresponding Source for all the software in the product that is covered by this License, on a durable physical medium customarily used for software interchange, for a price no more than your reasonable cost of physically performing this conveying of source, or (2) access to copy the Corresponding Source from a network server at no charge. c) Convey individual copies of the object code with a copy of the written offer to provide the Corresponding Source. This alternative is allowed only occasionally and noncommercially, and only if you received the object code with such an offer, in accord with subsection 6b. 
d) Convey the object code by offering access from a designated place (gratis or for a charge), and offer equivalent access to the Corresponding Source in the same way through the same place at no further charge. You need not require recipients to copy the Corresponding Source along with the object code. If the place to copy the object code is a network server, the Corresponding Source may be on a different server (operated by you or a third party) that supports equivalent copying facilities, provided you maintain clear directions next to the object code saying where to find the Corresponding Source. Regardless of what server hosts the Corresponding Source, you remain obligated to ensure that it is available for as long as needed to satisfy these requirements. e) Convey the object code using peer-to-peer transmission, provided you inform other peers where the object code and Corresponding Source of the work are being offered to the general public at no charge under subsection 6d. A separable portion of the object code, whose source code is excluded from the Corresponding Source as a System Library, need not be included in conveying the object code work. A "User Product" is either (1) a "consumer product", which means any tangible personal property which is normally used for personal, family, or household purposes, or (2) anything designed or sold for incorporation into a dwelling. In determining whether a product is a consumer product, doubtful cases shall be resolved in favor of coverage. For a particular product received by a particular user, "normally used" refers to a typical or common use of that class of product, regardless of the status of the particular user or of the way in which the particular user actually uses, or expects or is expected to use, the product. 
A product is a consumer product regardless of whether the product has substantial commercial, industrial or non-consumer uses, unless such uses represent the only significant mode of use of the product. "Installation Information" for a User Product means any methods, procedures, authorization keys, or other information required to install and execute modified versions of a covered work in that User Product from a modified version of its Corresponding Source. The information must suffice to ensure that the continued functioning of the modified object code is in no case prevented or interfered with solely because modification has been made. If you convey an object code work under this section in, or with, or specifically for use in, a User Product, and the conveying occurs as part of a transaction in which the right of possession and use of the User Product is transferred to the recipient in perpetuity or for a fixed term (regardless of how the transaction is characterized), the Corresponding Source conveyed under this section must be accompanied by the Installation Information. But this requirement does not apply if neither you nor any third party retains the ability to install modified object code on the User Product (for example, the work has been installed in ROM). The requirement to provide Installation Information does not include a requirement to continue to provide support service, warranty, or updates for a work that has been modified or installed by the recipient, or for the User Product in which it has been modified or installed. Access to a network may be denied when the modification itself materially and adversely affects the operation of the network or violates the rules and protocols for communication across the network. 
Corresponding Source conveyed, and Installation Information provided, in accord with this section must be in a format that is publicly documented (and with an implementation available to the public in source code form), and must require no special password or key for unpacking, reading or copying. 7. Additional Terms. "Additional permissions" are terms that supplement the terms of this License by making exceptions from one or more of its conditions. Additional permissions that are applicable to the entire Program shall be treated as though they were included in this License, to the extent that they are valid under applicable law. If additional permissions apply only to part of the Program, that part may be used separately under those permissions, but the entire Program remains governed by this License without regard to the additional permissions. When you convey a copy of a covered work, you may at your option remove any additional permissions from that copy, or from any part of it. (Additional permissions may be written to require their own removal in certain cases when you modify the work.) You may place additional permissions on material, added by you to a covered work, for which you have or can give appropriate copyright permission. 
Notwithstanding any other provision of this License, for material you add to a covered work, you may (if authorized by the copyright holders of that material) supplement the terms of this License with terms: a) Disclaiming warranty or limiting liability differently from the terms of sections 15 and 16 of this License; or b) Requiring preservation of specified reasonable legal notices or author attributions in that material or in the Appropriate Legal Notices displayed by works containing it; or c) Prohibiting misrepresentation of the origin of that material, or requiring that modified versions of such material be marked in reasonable ways as different from the original version; or d) Limiting the use for publicity purposes of names of licensors or authors of the material; or e) Declining to grant rights under trademark law for use of some trade names, trademarks, or service marks; or f) Requiring indemnification of licensors and authors of that material by anyone who conveys the material (or modified versions of it) with contractual assumptions of liability to the recipient, for any liability that these contractual assumptions directly impose on those licensors and authors. All other non-permissive additional terms are considered "further restrictions" within the meaning of section 10. If the Program as you received it, or any part of it, contains a notice stating that it is governed by this License along with a term that is a further restriction, you may remove that term. If a license document contains a further restriction but permits relicensing or conveying under this License, you may add to a covered work material governed by the terms of that license document, provided that the further restriction does not survive such relicensing or conveying. 
If you add terms to a covered work in accord with this section, you must place, in the relevant source files, a statement of the additional terms that apply to those files, or a notice indicating where to find the applicable terms. Additional terms, permissive or non-permissive, may be stated in the form of a separately written license, or stated as exceptions; the above requirements apply either way. 8. Termination. You may not propagate or modify a covered work except as expressly provided under this License. Any attempt otherwise to propagate or modify it is void, and will automatically terminate your rights under this License (including any patent licenses granted under the third paragraph of section 11). However, if you cease all violation of this License, then your license from a particular copyright holder is reinstated (a) provisionally, unless and until the copyright holder explicitly and finally terminates your license, and (b) permanently, if the copyright holder fails to notify you of the violation by some reasonable means prior to 60 days after the cessation. Moreover, your license from a particular copyright holder is reinstated permanently if the copyright holder notifies you of the violation by some reasonable means, this is the first time you have received notice of violation of this License (for any work) from that copyright holder, and you cure the violation prior to 30 days after your receipt of the notice. Termination of your rights under this section does not terminate the licenses of parties who have received copies or rights from you under this License. If your rights have been terminated and not permanently reinstated, you do not qualify to receive new licenses for the same material under section 10. 9. Acceptance Not Required for Having Copies. You are not required to accept this License in order to receive or run a copy of the Program. 
Ancillary propagation of a covered work occurring solely as a consequence of using peer-to-peer transmission to receive a copy likewise does not require acceptance. However, nothing other than this License grants you permission to propagate or modify any covered work. These actions infringe copyright if you do not accept this License. Therefore, by modifying or propagating a covered work, you indicate your acceptance of this License to do so. 10. Automatic Licensing of Downstream Recipients. Each time you convey a covered work, the recipient automatically receives a license from the original licensors, to run, modify and propagate that work, subject to this License. You are not responsible for enforcing compliance by third parties with this License. An "entity transaction" is a transaction transferring control of an organization, or substantially all assets of one, or subdividing an organization, or merging organizations. If propagation of a covered work results from an entity transaction, each party to that transaction who receives a copy of the work also receives whatever licenses to the work the party's predecessor in interest had or could give under the previous paragraph, plus a right to possession of the Corresponding Source of the work from the predecessor in interest, if the predecessor has it or can get it with reasonable efforts. You may not impose any further restrictions on the exercise of the rights granted or affirmed under this License. For example, you may not impose a license fee, royalty, or other charge for exercise of rights granted under this License, and you may not initiate litigation (including a cross-claim or counterclaim in a lawsuit) alleging that any patent claim is infringed by making, using, selling, offering for sale, or importing the Program or any portion of it. 11. Patents. A "contributor" is a copyright holder who authorizes use under this License of the Program or a work on which the Program is based. 
The work thus licensed is called the contributor's "contributor version". A contributor's "essential patent claims" are all patent claims owned or controlled by the contributor, whether already acquired or hereafter acquired, that would be infringed by some manner, permitted by this License, of making, using, or selling its contributor version, but do not include claims that would be infringed only as a consequence of further modification of the contributor version. For purposes of this definition, "control" includes the right to grant patent sublicenses in a manner consistent with the requirements of this License. Each contributor grants you a non-exclusive, worldwide, royalty-free patent license under the contributor's essential patent claims, to make, use, sell, offer for sale, import and otherwise run, modify and propagate the contents of its contributor version. In the following three paragraphs, a "patent license" is any express agreement or commitment, however denominated, not to enforce a patent (such as an express permission to practice a patent or covenant not to sue for patent infringement). To "grant" such a patent license to a party means to make such an agreement or commitment not to enforce a patent against the party. If you convey a covered work, knowingly relying on a patent license, and the Corresponding Source of the work is not available for anyone to copy, free of charge and under the terms of this License, through a publicly available network server or other readily accessible means, then you must either (1) cause the Corresponding Source to be so available, or (2) arrange to deprive yourself of the benefit of the patent license for this particular work, or (3) arrange, in a manner consistent with the requirements of this License, to extend the patent license to downstream recipients. 
"Knowingly relying" means you have actual knowledge that, but for the patent license, your conveying the covered work in a country, or your recipient's use of the covered work in a country, would infringe one or more identifiable patents in that country that you have reason to believe are valid. If, pursuant to or in connection with a single transaction or arrangement, you convey, or propagate by procuring conveyance of, a covered work, and grant a patent license to some of the parties receiving the covered work authorizing them to use, propagate, modify or convey a specific copy of the covered work, then the patent license you grant is automatically extended to all recipients of the covered work and works based on it. A patent license is "discriminatory" if it does not include within the scope of its coverage, prohibits the exercise of, or is conditioned on the non-exercise of one or more of the rights that are specifically granted under this License. You may not convey a covered work if you are a party to an arrangement with a third party that is in the business of distributing software, under which you make payment to the third party based on the extent of your activity of conveying the work, and under which the third party grants, to any of the parties who would receive the covered work from you, a discriminatory patent license (a) in connection with copies of the covered work conveyed by you (or copies made from those copies), or (b) primarily for and in connection with specific products or compilations that contain the covered work, unless you entered into that arrangement, or that patent license was granted, prior to 28 March 2007. Nothing in this License shall be construed as excluding or limiting any implied license or other defenses to infringement that may otherwise be available to you under applicable patent law. 12. No Surrender of Others' Freedom. 
If conditions are imposed on you (whether by court order, agreement or otherwise) that contradict the conditions of this License, they do not excuse you from the conditions of this License. If you cannot convey a covered work so as to satisfy simultaneously your obligations under this License and any other pertinent obligations, then as a consequence you may not convey it at all. For example, if you agree to terms that obligate you to collect a royalty for further conveying from those to whom you convey the Program, the only way you could satisfy both those terms and this License would be to refrain entirely from conveying the Program. 13. Remote Network Interaction; Use with the GNU General Public License. Notwithstanding any other provision of this License, if you modify the Program, your modified version must prominently offer all users interacting with it remotely through a computer network (if your version supports such interaction) an opportunity to receive the Corresponding Source of your version by providing access to the Corresponding Source from a network server at no charge, through some standard or customary means of facilitating copying of software. This Corresponding Source shall include the Corresponding Source for any work covered by version 3 of the GNU General Public License that is incorporated pursuant to the following paragraph. Notwithstanding any other provision of this License, you have permission to link or combine any covered work with a work licensed under version 3 of the GNU General Public License into a single combined work, and to convey the resulting work. The terms of this License will continue to apply to the part which is the covered work, but the work with which it is combined will remain governed by version 3 of the GNU General Public License. 14. Revised Versions of this License. The Free Software Foundation may publish revised and/or new versions of the GNU Affero General Public License from time to time. 
Such new versions will be similar in spirit to the present version, but may differ in detail to address new problems or concerns. Each version is given a distinguishing version number. If the Program specifies that a certain numbered version of the GNU Affero General Public License "or any later version" applies to it, you have the option of following the terms and conditions either of that numbered version or of any later version published by the Free Software Foundation. If the Program does not specify a version number of the GNU Affero General Public License, you may choose any version ever published by the Free Software Foundation. If the Program specifies that a proxy can decide which future versions of the GNU Affero General Public License can be used, that proxy's public statement of acceptance of a version permanently authorizes you to choose that version for the Program. Later license versions may give you additional or different permissions. However, no additional obligations are imposed on any author or copyright holder as a result of your choosing to follow a later version. 15. Disclaimer of Warranty. THERE IS NO WARRANTY FOR THE PROGRAM, TO THE EXTENT PERMITTED BY APPLICABLE LAW. EXCEPT WHEN OTHERWISE STATED IN WRITING THE COPYRIGHT HOLDERS AND/OR OTHER PARTIES PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY OF ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE. THE ENTIRE RISK AS TO THE QUALITY AND PERFORMANCE OF THE PROGRAM IS WITH YOU. SHOULD THE PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF ALL NECESSARY SERVICING, REPAIR OR CORRECTION. 16. Limitation of Liability. 
IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MODIFIES AND/OR CONVEYS THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES, INCLUDING ANY GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING OUT OF THE USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED TO LOSS OF DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY YOU OR THIRD PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER PROGRAMS), EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE POSSIBILITY OF SUCH DAMAGES. 17. Interpretation of Sections 15 and 16. If the disclaimer of warranty and limitation of liability provided above cannot be given local legal effect according to their terms, reviewing courts shall apply local law that most closely approximates an absolute waiver of all civil liability in connection with the Program, unless a warranty or assumption of liability accompanies a copy of the Program in return for a fee. END OF TERMS AND CONDITIONS How to Apply These Terms to Your New Programs If you develop a new program, and you want it to be of the greatest possible use to the public, the best way to achieve this is to make it free software which everyone can redistribute and change under these terms. To do so, attach the following notices to the program. It is safest to attach them to the start of each source file to most effectively state the exclusion of warranty; and each file should have at least the "copyright" line and a pointer to where the full notice is found. <one line to give the program's name and a brief idea of what it does.> Copyright (C) <year> <name of author> This program is free software: you can redistribute it and/or modify it under the terms of the GNU Affero General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version.
This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more details. You should have received a copy of the GNU Affero General Public License along with this program. If not, see <https://www.gnu.org/licenses/>. Also add information on how to contact you by electronic and paper mail. If your software can interact with users remotely through a computer network, you should also make sure that it provides a way for users to get its source. For example, if your program is a web application, its interface could display a "Source" link that leads users to an archive of the code. There are many ways you could offer source, and different solutions will be better for different programs; see section 13 for the specific requirements. You should also get your employer (if you work as a programmer) or school, if any, to sign a "copyright disclaimer" for the program, if necessary. For more information on this, and how to apply and follow the GNU AGPL, see <https://www.gnu.org/licenses/>. ================================================ FILE: MANIFEST.in ================================================ recursive-include pr_agent *.toml recursive-exclude pr_agent *.secrets.toml ================================================ FILE: README.md ================================================ GitHub
# 🚀 PR Agent - The Original Open-Source PR Reviewer. This repository contains the open-source PR Agent Project. It is not the Qodo free tier. Try the free version on our website. 👉[Get Started Now](https://www.qodo.ai/get-started/) PR-Agent is an open-source, AI-powered code review agent and a community-maintained legacy project of Qodo. It is distinct from Qodo’s primary AI code review offering, which provides a feature-rich, context-aware experience. Qodo now offers a free tier that integrates seamlessly with GitHub, GitLab, Bitbucket, and Azure DevOps for high-quality automated reviews. ## Table of Contents - [Getting Started](#getting-started) - [Why Use PR-Agent?](#why-use-pr-agent) - [Features](#features) - [See It in Action](#see-it-in-action) - [Try It Now](#try-it-now) - [How It Works](#how-it-works) - [Data Privacy](#data-privacy) - [Contributing](#contributing) ## Getting Started ### 🚀 Quick Start for PR-Agent #### 1. Try it Instantly (No Setup) Test PR-Agent on any public GitHub repository by commenting `@CodiumAI-Agent /improve` #### 2. GitHub Action (Recommended) Add automated PR reviews to your repository with a simple workflow file: ```yaml # .github/workflows/pr-agent.yml name: PR Agent on: pull_request: types: [opened, synchronize] jobs: pr_agent_job: runs-on: ubuntu-latest steps: - name: PR Agent action step uses: Codium-ai/pr-agent@main env: OPENAI_KEY: ${{ secrets.OPENAI_KEY }} GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} ``` [Full GitHub Action setup guide](https://qodo-merge-docs.qodo.ai/installation/github/#run-as-a-github-action) #### 3. CLI Usage (Local Development) Run PR-Agent locally on your repository: ```bash pip install pr-agent export OPENAI_KEY=your_key_here pr-agent --pr_url https://github.com/owner/repo/pull/123 review ``` [Complete CLI setup guide](https://qodo-merge-docs.qodo.ai/usage-guide/automations_and_usage/#local-repo-cli) #### 4. 
Other Platforms - [GitLab webhook setup](https://qodo-merge-docs.qodo.ai/installation/gitlab/) - [BitBucket app installation](https://qodo-merge-docs.qodo.ai/installation/bitbucket/) - [Azure DevOps setup](https://qodo-merge-docs.qodo.ai/installation/azure/) [//]: # (## News and Updates) [//]: # () [//]: # (## Aug 8, 2025) [//]: # () [//]: # () [//]: # () [//]: # (## Jul 17, 2025) [//]: # () [//]: # (Introducing `/compliance`, a new Qodo Merge 💎 tool that runs comprehensive checks for security, ticket requirements, codebase duplication, and custom organizational rules. ) [//]: # () [//]: # (compliance-image) [//]: # () [//]: # (Read more about it [here](https://qodo-merge-docs.qodo.ai/tools/compliance/)) [//]: # () [//]: # () [//]: # (## Jul 1, 2025) [//]: # (You can now receive automatic feedback from Qodo Merge in your local IDE after each commit. Read more about it [here](https://github.com/qodo-ai/agents/tree/main/agents/qodo-merge-post-commit).) [//]: # () [//]: # () [//]: # (## Jun 21, 2025) [//]: # () [//]: # (v0.30 was [released](https://github.com/qodo-ai/pr-agent/releases)) [//]: # () [//]: # () [//]: # (## Jun 3, 2025) [//]: # () [//]: # (Qodo Merge now offers a simplified free tier 💎.) [//]: # (Organizations can use Qodo Merge at no cost, with a [monthly limit](https://qodo-merge-docs.qodo.ai/installation/qodo_merge/#cloud-users) of 75 PR reviews per organization.) [//]: # () [//]: # () [//]: # (## Apr 30, 2025) [//]: # () [//]: # (A new feature is now available in the `/improve` tool for Qodo Merge 💎 - Chat on code suggestions.) [//]: # () [//]: # (image) [//]: # () [//]: # (Read more about it [here](https://qodo-merge-docs.qodo.ai/tools/improve/#chat-on-code-suggestions).) [//]: # () [//]: # () [//]: # (## Apr 16, 2025) [//]: # () [//]: # (New tool for Qodo Merge 💎 - `/scan_repo_discussions`.) [//]: # () [//]: # (image) [//]: # () [//]: # (Read more about it [here](https://qodo-merge-docs.qodo.ai/tools/scan_repo_discussions/).) ## Why Use PR-Agent? 
### 🎯 Built for Real Development Teams **Fast & Affordable**: Each tool (`/review`, `/improve`, `/ask`) uses a single LLM call (~30 seconds, low cost) **Handles Any PR Size**: Our [PR Compression strategy](https://qodo-merge-docs.qodo.ai/core-abilities/#pr-compression-strategy) effectively processes both small and large PRs **Highly Customizable**: JSON-based prompting allows easy customization of review categories and behavior via [configuration files](pr_agent/settings/configuration.toml) **Platform Agnostic**: - **Git Providers**: GitHub, GitLab, BitBucket, Azure DevOps, Gitea - **Deployment**: CLI, GitHub Actions, Docker, self-hosted, webhooks - **AI Models**: OpenAI GPT, Claude, Deepseek, and more **Open Source Benefits**: - Full control over your data and infrastructure - Customize prompts and behavior for your team's needs - No vendor lock-in - Community-driven development ## Features
PR-Agent offers comprehensive pull request functionalities integrated with various git providers: | | | GitHub | GitLab | Bitbucket | Azure DevOps | Gitea | |---------------------------------------------------------|----------------------------------------------------------------------------------------|:------:|:------:|:---------:|:------------:|:-----:| | [TOOLS](https://qodo-merge-docs.qodo.ai/tools/) | [Describe](https://qodo-merge-docs.qodo.ai/tools/describe/) | ✅ | ✅ | ✅ | ✅ | ✅ | | | [Review](https://qodo-merge-docs.qodo.ai/tools/review/) | ✅ | ✅ | ✅ | ✅ | ✅ | | | [Improve](https://qodo-merge-docs.qodo.ai/tools/improve/) | ✅ | ✅ | ✅ | ✅ | ✅ | | | [Ask](https://qodo-merge-docs.qodo.ai/tools/ask/) | ✅ | ✅ | ✅ | ✅ | | | | ⮑ [Ask on code lines](https://qodo-merge-docs.qodo.ai/tools/ask/#ask-lines) | ✅ | ✅ | | | | | | [Help Docs](https://qodo-merge-docs.qodo.ai/tools/help_docs/?h=auto#auto-approval) | ✅ | ✅ | ✅ | | | | | [Update CHANGELOG](https://qodo-merge-docs.qodo.ai/tools/update_changelog/) | ✅ | ✅ | ✅ | ✅ | | | | | | | | | | | [USAGE](https://qodo-merge-docs.qodo.ai/usage-guide/) | [CLI](https://qodo-merge-docs.qodo.ai/usage-guide/automations_and_usage/#local-repo-cli) | ✅ | ✅ | ✅ | ✅ | ✅ | | | [App / webhook](https://qodo-merge-docs.qodo.ai/usage-guide/automations_and_usage/#github-app) | ✅ | ✅ | ✅ | ✅ | ✅ | | | [Tagging bot](https://github.com/Codium-ai/pr-agent#try-it-now) | ✅ | | | | | | | [Actions](https://qodo-merge-docs.qodo.ai/installation/github/#run-as-a-github-action) | ✅ | ✅ | ✅ | ✅ | | | | | | | | | | | [CORE](https://qodo-merge-docs.qodo.ai/core-abilities/) | [Adaptive and token-aware file patch fitting](https://qodo-merge-docs.qodo.ai/core-abilities/compression_strategy/) | ✅ | ✅ | ✅ | ✅ | | | | [Chat on code suggestions](https://qodo-merge-docs.qodo.ai/core-abilities/chat_on_code_suggestions/) | ✅ | ✅ | | | | | | [Dynamic context](https://qodo-merge-docs.qodo.ai/core-abilities/dynamic_context/) | ✅ | ✅ | ✅ | ✅ | | | | [Fetching ticket 
context](https://qodo-merge-docs.qodo.ai/core-abilities/fetching_ticket_context/) | ✅ | ✅ | ✅ | | | | | [Incremental Update](https://qodo-merge-docs.qodo.ai/core-abilities/incremental_update/) | ✅ | | | | | | | [Interactivity](https://qodo-merge-docs.qodo.ai/core-abilities/interactivity/) | ✅ | ✅ | | | | | | [Local and global metadata](https://qodo-merge-docs.qodo.ai/core-abilities/metadata/) | ✅ | ✅ | ✅ | ✅ | | | | [Multiple models support](https://qodo-merge-docs.qodo.ai/usage-guide/changing_a_model/) | ✅ | ✅ | ✅ | ✅ | | | | [PR compression](https://qodo-merge-docs.qodo.ai/core-abilities/compression_strategy/) | ✅ | ✅ | ✅ | ✅ | | | | [RAG context enrichment](https://qodo-merge-docs.qodo.ai/core-abilities/rag_context_enrichment/) | ✅ | | ✅ | | | | | [Self reflection](https://qodo-merge-docs.qodo.ai/core-abilities/self_reflection/) | ✅ | ✅ | ✅ | ✅ | | [//]: # (- Support for additional git providers is described in [here](./docs/Full_environments.md)) ___ ## See It in Action

/describe


/review


/improve


## Try It Now Try the GPT-5 powered PR-Agent instantly on _your public GitHub repository_. Just mention `@CodiumAI-Agent` and add the desired command in any PR comment. The agent will generate a response based on your command. For example, add a comment to any pull request with the following text: ``` @CodiumAI-Agent /review ``` and the agent will respond with a review of your PR. Note that this is a promotional bot, suitable only for initial experimentation. It does not have 'edit' access to your repo, for example, so it cannot update the PR description or add labels (`@CodiumAI-Agent /describe` will publish PR description as a comment). In addition, the bot cannot be used on private repositories, as it does not have access to the files there. ## How It Works The following diagram illustrates PR-Agent tools and their flow: ![PR-Agent Tools](https://www.qodo.ai/images/pr_agent/diagram-v0.9.png) ## Data Privacy ### Self-hosted PR-Agent - If you host PR-Agent with your OpenAI API key, it is between you and OpenAI. You can read their API data privacy policy here: https://openai.com/enterprise-privacy ## Contributing To contribute to the project, get started by reading our [Contributing Guide](https://github.com/qodo-ai/pr-agent/blob/b09eec265ef7d36c232063f76553efb6b53979ff/CONTRIBUTING.md). ## ❤️ Community This open-source release remains here as a community contribution from Qodo — the origin of modern AI-powered code collaboration. We’re proud to share it and inspire developers worldwide. The project now has its first external maintainer, Naor ([@naorpeled](https://github.com/naorpeled)), and is currently in the process of being donated to an open-source foundation. 
================================================ FILE: RELEASE_NOTES.md ================================================ ## [Version 0.11] - 2023-12-07 - codiumai/pr-agent:0.11 - codiumai/pr-agent:0.11-github_app - codiumai/pr-agent:0.11-bitbucket-app - codiumai/pr-agent:0.11-gitlab_webhook - codiumai/pr-agent:0.11-github_polling - codiumai/pr-agent:0.11-github_action ### Added::Algo - New section in `/describe` tool - [PR changes walkthrough](https://github.com/Codium-ai/pr-agent/pull/509) - Improving PR Agent [prompts](https://github.com/Codium-ai/pr-agent/pull/501) - Persistent tools (`/review`, `/describe`) now send an [update message](https://github.com/Codium-ai/pr-agent/pull/499) after finishing - Add Amazon Bedrock [support](https://github.com/Codium-ai/pr-agent/pull/483) ### Fixed - Update [dependencies](https://github.com/Codium-ai/pr-agent/pull/503) in requirements.txt for Python 3.12 ## [Version 0.10] - 2023-11-15 - codiumai/pr-agent:0.10 - codiumai/pr-agent:0.10-github_app - codiumai/pr-agent:0.10-bitbucket-app - codiumai/pr-agent:0.10-gitlab_webhook - codiumai/pr-agent:0.10-github_polling - codiumai/pr-agent:0.10-github_action ### Added::Algo - Review tool now works with [persistent comments](https://github.com/Codium-ai/pr-agent/pull/451) by default - Bitbucket now publishes review suggestions with [code links](https://github.com/Codium-ai/pr-agent/pull/428) - Enabling to limit [max number of tokens](https://github.com/Codium-ai/pr-agent/pull/437/files) - Support ['gpt-4-1106-preview'](https://github.com/Codium-ai/pr-agent/pull/437/files) model - Support for Google's [Vertex AI](https://github.com/Codium-ai/pr-agent/pull/436) - Implementing [thresholds](https://github.com/Codium-ai/pr-agent/pull/423) for incremental PR reviews - Decoupled custom labels from [PR type](https://github.com/Codium-ai/pr-agent/pull/431) ### Fixed - Fixed bug in [parsing quotes](https://github.com/Codium-ai/pr-agent/pull/446) in CLI - Preserve [user-added 
labels](https://github.com/Codium-ai/pr-agent/pull/433) in pull requests - Bug fixes in GitLab and BitBucket ## [Version 0.9] - 2023-10-29 - codiumai/pr-agent:0.9 - codiumai/pr-agent:0.9-github_app - codiumai/pr-agent:0.9-bitbucket-app - codiumai/pr-agent:0.9-gitlab_webhook - codiumai/pr-agent:0.9-github_polling - codiumai/pr-agent:0.9-github_action ### Added::Algo - New tool - [generate_labels](https://github.com/Codium-ai/pr-agent/blob/main/docs/GENERATE_CUSTOM_LABELS.md) - New ability to use [customize labels](https://github.com/Codium-ai/pr-agent/blob/main/docs/GENERATE_CUSTOM_LABELS.md#how-to-enable-custom-labels) on the `review` and `describe` tools. - New tool - [add_docs](https://github.com/Codium-ai/pr-agent/blob/main/docs/ADD_DOCUMENTATION.md) - GitHub Action: Can now use a `.pr_agent.toml` file to control configuration parameters (see [Usage Guide](./Usage.md#working-with-github-action)). - GitHub App: Added ability to trigger tools on [push events](https://github.com/Codium-ai/pr-agent/blob/main/Usage.md#github-app-automatic-tools-for-new-code-pr-push) - Support custom domain URLs for Azure devops integration (see [link](https://github.com/Codium-ai/pr-agent/pull/381)). - PR Description default mode is now in [bullet points](https://github.com/Codium-ai/pr-agent/blob/main/pr_agent/settings/configuration.toml#L35). 
### Added::Documentation Significant documentation updates (see [Installation Guide](https://github.com/Codium-ai/pr-agent/blob/main/INSTALL.md), [Usage Guide](https://github.com/Codium-ai/pr-agent/blob/main/Usage.md), and [Tools Guide](https://github.com/Codium-ai/pr-agent/blob/main/docs/TOOLS_GUIDE.md)) ### Fixed - Fixed support for BitBucket pipeline (see [link](https://github.com/Codium-ai/pr-agent/pull/386)) - Fixed a bug in `review -i` tool - Added blacklist for specific file extensions in `add_docs` tool (see [link](https://github.com/Codium-ai/pr-agent/pull/385/)) ## [Version 0.8] - 2023-09-27 - codiumai/pr-agent:0.8 - codiumai/pr-agent:0.8-github_app - codiumai/pr-agent:0.8-bitbucket-app - codiumai/pr-agent:0.8-gitlab_webhook - codiumai/pr-agent:0.8-github_polling - codiumai/pr-agent:0.8-github_action ### Added::Algo - GitHub Action: Can control which tools will run automatically when a new PR is created. (see usage guide: https://github.com/Codium-ai/pr-agent/blob/main/Usage.md#working-with-github-action) - Code suggestion tool: Will try to avoid an 'add comments' suggestion (see https://github.com/Codium-ai/pr-agent/pull/327) ### Fixed - Gitlab: Fixed a bug of improper usage of pr_id ## [Version 0.7] - 2023-09-20 ### Docker Tags - codiumai/pr-agent:0.7 - codiumai/pr-agent:0.7-github_app - codiumai/pr-agent:0.7-bitbucket-app - codiumai/pr-agent:0.7-gitlab_webhook - codiumai/pr-agent:0.7-github_polling - codiumai/pr-agent:0.7-github_action ### Added::Algo - New tool /similar_issue - Currently on GitHub app and CLI: indexes the issues in the repo, find the most similar issues to the target issue. - Describe markers: Empower the /describe tool with a templating capability (see more details in https://github.com/Codium-ai/pr-agent/pull/273). - New feature in the /review tool - added an estimated effort estimation to the review (https://github.com/Codium-ai/pr-agent/pull/306). ### Added::Infrastructure - Implementation of a GitLab webhook. 
- Implementation of a BitBucket app. ### Fixed - Protection against no code suggestions generated. - Resilience to repositories where the languages cannot be automatically detected. ================================================ FILE: SECURITY.md ================================================ # Security Policy PR-Agent is an open-source tool to help efficiently review and handle pull requests. Qodo Merge is a paid version of PR-Agent, designed for companies and teams that require additional features and capabilities. This document describes the security policy of PR-Agent. For Qodo Merge's security policy, see [here](https://qodo-merge-docs.qodo.ai/overview/data_privacy/#qodo-merge). ## PR-Agent Self-Hosted Solutions When using PR-Agent with your OpenAI (or other LLM provider) API key, the security relationship is directly between you and the provider. We do not send your code to Qodo servers. Types of [self-hosted solutions](https://qodo-merge-docs.qodo.ai/installation): - Locally - GitHub integration - GitLab integration - BitBucket integration - Azure DevOps integration ## PR-Agent Supported Versions This section outlines which versions of PR-Agent are currently supported with security updates. ### Docker Deployment Options #### Latest Version For the most recent updates, use our latest Docker image which is automatically built nightly: ```yaml uses: qodo-ai/pr-agent@main ``` #### Specific Release Version For a fixed version, you can pin your action to a specific release version. 
Browse available releases at: [PR-Agent Releases](https://github.com/qodo-ai/pr-agent/releases) For example, to github action: ```yaml steps: - name: PR Agent action step id: pragent uses: docker://codiumai/pr-agent:0.26-github_action ``` #### Enhanced Security with Docker Digest For maximum security, you can specify the Docker image using its digest: ```yaml steps: - name: PR Agent action step id: pragent uses: docker://codiumai/pr-agent@sha256:14165e525678ace7d9b51cda8652c2d74abb4e1d76b57c4a6ccaeba84663cc64 ``` ## Reporting a Vulnerability We take the security of PR-Agent seriously. If you discover a security vulnerability, please report it immediately to: Email: security@qodo.ai Please include a description of the vulnerability, steps to reproduce, and the affected PR-Agent version. ================================================ FILE: action.yaml ================================================ name: 'Codium PR Agent' description: 'Summarize, review and suggest improvements for pull requests' branding: icon: 'award' color: 'green' runs: using: 'docker' image: 'Dockerfile.github_action_dockerhub' ================================================ FILE: codecov.yml ================================================ comment: false coverage: status: patch: false project: false ================================================ FILE: docker/Dockerfile ================================================ FROM python:3.12.10-slim AS base RUN apt update && apt install --no-install-recommends -y git curl && apt-get clean && rm -rf /var/lib/apt/lists/* WORKDIR /app ADD pyproject.toml . ADD requirements.txt . ADD docs docs RUN pip install --no-cache-dir . 
&& rm pyproject.toml requirements.txt ENV PYTHONPATH=/app FROM base AS github_app ADD pr_agent pr_agent CMD ["python", "-m", "gunicorn", "-k", "uvicorn.workers.UvicornWorker", "-c", "pr_agent/servers/gunicorn_config.py", "--forwarded-allow-ips", "*", "pr_agent.servers.github_app:app"] FROM base AS bitbucket_app ADD pr_agent pr_agent CMD ["python", "pr_agent/servers/bitbucket_app.py"] FROM base AS bitbucket_server_webhook ADD pr_agent pr_agent CMD ["python", "pr_agent/servers/bitbucket_server_webhook.py"] FROM base AS github_polling ADD pr_agent pr_agent CMD ["python", "pr_agent/servers/github_polling.py"] FROM base AS gitlab_webhook ADD pr_agent pr_agent CMD ["python", "pr_agent/servers/gitlab_webhook.py"] FROM base AS azure_devops_webhook ADD pr_agent pr_agent CMD ["python", "pr_agent/servers/azuredevops_server_webhook.py"] FROM base AS gitea_app ADD pr_agent pr_agent CMD ["python", "-m", "gunicorn", "-k", "uvicorn.workers.UvicornWorker", "-c", "pr_agent/servers/gunicorn_config.py","pr_agent.servers.gitea_app:app"] FROM base AS test ADD requirements-dev.txt . RUN pip install --no-cache-dir -r requirements-dev.txt && rm requirements-dev.txt ADD pr_agent pr_agent ADD tests tests FROM base AS cli ADD pr_agent pr_agent ENTRYPOINT ["python", "pr_agent/cli.py"] ================================================ FILE: docker/Dockerfile.lambda ================================================ FROM public.ecr.aws/lambda/python:3.12 AS base RUN dnf update -y && \ dnf install -y gcc python3-devel git && \ dnf clean all ADD pyproject.toml requirements.txt ./ RUN pip install --no-cache-dir . 
&& rm pyproject.toml RUN pip install --no-cache-dir mangum==0.17.0 COPY pr_agent/ ${LAMBDA_TASK_ROOT}/pr_agent/ FROM base AS github_lambda CMD ["pr_agent.servers.github_lambda_webhook.lambda_handler"] FROM base AS gitlab_lambda CMD ["pr_agent.servers.gitlab_lambda_webhook.lambda_handler"] FROM github_lambda ================================================ FILE: docs/README.md ================================================ # [Visit Our Docs Portal](https://qodo-merge-docs.qodo.ai/) ================================================ FILE: docs/docs/.gitbook.yaml ================================================ root: ./ structure: readme: ../README.md summary: ./summary.md ================================================ FILE: docs/docs/CNAME ================================================ qodo-merge-docs.qodo.ai ================================================ FILE: docs/docs/core-abilities/compression_strategy.md ================================================ `Supported Git Platforms: GitHub, GitLab, Bitbucket` ## Overview There are two scenarios: 1. The PR is small enough to fit in a single prompt (including system and user prompt) 2. The PR is too large to fit in a single prompt (including system and user prompt) For both scenarios, we first use the following strategy #### Repo language prioritization strategy We prioritize the languages of the repo based on the following criteria: 1. Exclude binary files and non code files (e.g. images, pdfs, etc) 2. Given the main languages used in the repo 3. We sort the PR files by the most common languages in the repo (in descending order): * ```[[file.py, file2.py],[file3.js, file4.jsx],[readme.md]]``` ### Small PR In this case, we can fit the entire PR in a single prompt: 1. Exclude binary files and non code files (e.g. images, pdfs, etc) 2. 
We expand the surrounding context of each patch to 3 lines above and below the patch ### Large PR #### Motivation Pull Requests can be very long and contain a lot of information with varying degrees of relevance to the pr-agent. We want to be able to pack as much information as possible in a single LLM prompt, while keeping the information relevant to the pr-agent. #### Compression strategy We prioritize additions over deletions: * Combine all deleted files into a single list (`deleted files`) * File patches are a list of hunks; remove all hunks of type deletion-only from the hunks in the file patch #### Adaptive and token-aware file patch fitting We use [tiktoken](https://github.com/openai/tiktoken) to tokenize the patches after the modifications described above, and we use the following strategy to fit the patches into the prompt: 1. Within each language we sort the files by the number of tokens in the file (in descending order): * ```[[file2.py, file.py],[file4.jsx, file3.js],[readme.md]]``` 2. Iterate through the patches in the order described above 3. Add the patches to the prompt until the prompt reaches a certain buffer from the max token length 4. If there are still patches left, add the remaining patches as a list called `other modified files` to the prompt until the prompt reaches the max token length (hard stop), and skip the rest of the patches. 5. If we haven't reached the max token length, add the `deleted files` to the prompt until the prompt reaches the max token length (hard stop), and skip the rest of the patches. #### Example ![Core Abilities](https://codium.ai/images/git_patch_logic.png){width=768} ================================================ FILE: docs/docs/core-abilities/dynamic_context.md ================================================ `Supported Git Platforms: GitHub, GitLab, Bitbucket` PR-Agent uses an **asymmetric and dynamic context strategy** to improve AI analysis of code changes in pull requests. 
It provides more context before changes than after, and dynamically adjusts the context based on code structure (e.g., enclosing functions or classes). This approach balances providing sufficient context for accurate analysis, while avoiding needle-in-the-haystack information overload that could degrade AI performance or exceed token limits. ## Introduction Pull request code changes are retrieved in a unified diff format, showing three lines of context before and after each modified section, with additions marked by '+' and deletions by '-'. ```diff @@ -12,5 +12,5 @@ def func1(): code line that already existed in the file... code line that already existed in the file... code line that already existed in the file.... -code line that was removed in the PR +new code line added in the PR code line that already existed in the file... code line that already existed in the file... code line that already existed in the file... @@ -26,2 +26,4 @@ def func2(): ... ``` This unified diff format can be challenging for AI models to interpret accurately, as it provides limited context for understanding the full scope of code changes. The presentation of code using '+', '-', and ' ' symbols to indicate additions, deletions, and unchanged lines respectively also differs from the standard code formatting typically used to train AI models. ## Challenges of expanding the context window While expanding the context window is technically feasible, it presents a more fundamental trade-off: Pros: - Enhanced context allows the model to better comprehend and localize the code changes, results (potentially) in more precise analysis and suggestions. Without enough context, the model may struggle to understand the code changes and provide relevant feedback. Cons: - Excessive context may overwhelm the model with extraneous information, creating a "needle in a haystack" scenario where focusing on the relevant details (the code that actually changed) becomes challenging. 
LLM quality is known to degrade when the context gets larger. Pull requests often encompass multiple changes across many files, potentially spanning hundreds of lines of modified code. This complexity presents a genuine risk of overwhelming the model with excessive context. - Increased context expands the token count, increasing processing time and cost, and may prevent the model from processing the entire pull request in a single pass. ## Asymmetric and dynamic context To address these challenges, PR-Agent employs an **asymmetric** and **dynamic** context strategy, providing the model with more focused and relevant context information for each code change. **Asymmetric:** We start by recognizing that the context preceding a code change is typically more crucial for understanding the modification than the context following it. Consequently, PR-Agent implements an asymmetric context policy, decoupling the context window into two distinct segments: one for the code before the change and another for the code after. By independently adjusting each context window, PR-Agent can supply the model with a more tailored and pertinent context for individual code changes. **Dynamic:** We also employ a "dynamic" context strategy. We start by recognizing that the optimal context for a code change often corresponds to its enclosing code component (e.g., function, class), rather than a fixed number of lines. Consequently, we dynamically adjust the context window based on the code's structure, ensuring the model receives the most pertinent information for each modification. To prevent overwhelming the model with excessive context, we impose a limit on the number of lines searched when identifying the enclosing component. This balance allows for comprehensive understanding while maintaining efficiency and limiting context token usage. 
## Appendix - relevant configuration options ```toml [config] patch_extension_skip_types =[".md",".txt"] # Skip files with these extensions when trying to extend the context allow_dynamic_context=true # Allow dynamic context extension max_extra_lines_before_dynamic_context = 8 # will try to include up to X extra lines before the hunk in the patch, until we reach an enclosing function or class patch_extra_lines_before = 3 # Number of extra lines (+3 default ones) to include before each hunk in the patch patch_extra_lines_after = 1 # Number of extra lines (+3 default ones) to include after each hunk in the patch ``` ================================================ FILE: docs/docs/core-abilities/fetching_ticket_context.md ================================================ # Fetching Ticket Context for PRs `Supported Git Platforms: GitHub, GitLab, Bitbucket` !!! note "Branch-name issue linking: GitHub only (for now)" Extracting issue links from the **branch name** (and the optional `branch_issue_regex` setting) is currently implemented for **GitHub only**. Support for GitLab, Bitbucket, and other platforms is planned for a later release. The GitHub flow was the most relevant to implement first; other providers will follow. ## Overview PR-Agent streamlines code review workflows by seamlessly connecting with multiple ticket management systems. This integration enriches the review process by automatically surfacing relevant ticket information and context alongside code changes. **Ticket systems supported**: - [GitHub/Gitlab Issues](#githubgitlab-issues-integration) - [Jira](#jira-integration) **Ticket data fetched:** 1. Ticket Title 2. Ticket Description 3. Custom Fields (Acceptance criteria) 4. Subtasks (linked tasks) 5. Labels 6. Attached Images/Screenshots ## Affected Tools Ticket Recognition Requirements: - The PR description should contain a link to the ticket or if the branch name starts with the ticket id / number. 
- For Jira tickets, you should follow the instructions in [Jira Integration](#jira-integration) in order to authenticate with Jira. ### Describe tool PR-Agent will recognize the ticket and use the ticket content (title, description, labels) to provide additional context for the code changes. By understanding the reasoning and intent behind modifications, the LLM can offer more insightful and relevant code analysis. ### Review tool Similar to the `describe` tool, the `review` tool will use the ticket content to provide additional context for the code changes. In addition, this feature will evaluate how well a Pull Request (PR) adheres to its original purpose/intent as defined by the associated ticket or issue mentioned in the PR description. Each ticket will be assigned a label (Compliance/Alignment level) that indicates the degree to which the PR fulfills its original purpose: - Fully Compliant - Partially Compliant - Not Compliant - PR Code Verified ![Ticket Compliance](https://www.qodo.ai/images/pr_agent/ticket_compliance_review.png){width=768} A `PR Code Verified` label indicates the PR code meets ticket requirements, but requires additional manual testing beyond the code scope. For example, validating UI display across different environments (Mac, Windows, mobile, etc.). #### Configuration options - By default, the `review` tool will automatically validate if the PR complies with the referenced ticket. If you want to disable this feedback, add the following line to your configuration file: ```toml [pr_reviewer] require_ticket_analysis_review=false ``` - If you set: ```toml [pr_reviewer] check_pr_additional_content=true ``` (default: `false`) the `review` tool will also validate that the PR code doesn't contain any additional content that is not related to the ticket. If it does, the PR will be labeled at best as `PR Code Verified`, and the `review` tool will provide a comment with the additional unrelated content found in the PR code. 
## GitHub/Gitlab Issues Integration PR-Agent will automatically recognize GitHub/Gitlab issues mentioned in the PR description and fetch the issue content. Examples of valid GitHub/Gitlab issue references: - `https://github.com/<ORGANIZATION>/<REPOSITORY>/issues/<ISSUE NUMBER>` or `https://gitlab.com/<ORGANIZATION>/<REPOSITORY>/-/issues/<ISSUE NUMBER>` - `#<ISSUE NUMBER>` - `<ORGANIZATION>/<REPOSITORY>#<ISSUE NUMBER>` Branch names can also be used to link issues, for example: - `123-fix-bug` (where `123` is the issue number) This branch-name detection applies **only when the git provider is GitHub**. Support for other platforms is planned for later. Since PR-Agent is integrated with GitHub, it doesn't require any additional configuration to fetch GitHub issues. ## Jira Integration We support both Jira Cloud and Jira Server/Data Center. ### Jira Cloud #### Email/Token Authentication You can create an API token from your Atlassian account: 1. Log in to https://id.atlassian.com/manage-profile/security/api-tokens. 2. Click Create API token. 3. From the dialog that appears, enter a name for your new token and click Create. 4. Click Copy to clipboard. ![Jira Cloud API Token](https://images.ctfassets.net/zsv3d0ugroxu/1RYvh9lqgeZjjNe5S3Hbfb/155e846a1cb38f30bf17512b6dfd2229/screenshot_NewAPIToken){width=384} 5. In your [configuration file](../usage-guide/configuration_options.md) add the following lines: ```toml [jira] jira_api_token = "YOUR_API_TOKEN" jira_api_email = "YOUR_EMAIL" ``` ### Jira Data Center/Server #### Using Basic Authentication for Jira Data Center/Server You can use your Jira username and password to authenticate with Jira Data Center/Server. In your Configuration file/Environment variables/Secrets file, add the following lines: ```toml jira_api_email = "your_username" jira_api_token = "your_password" ``` (Note that indeed the 'jira_api_email' field is used for the username, and the 'jira_api_token' field is used for the user password.) 
##### Validating Basic authentication via Python script If you are facing issues retrieving tickets in PR-Agent with Basic auth, you can validate the flow using a Python script. The following steps will help you check if the basic auth is working correctly, and if you can access the Jira ticket details: 1. run `pip install jira==3.8.0` 2. run the following Python script (after replacing the placeholders with your actual values): ???- example "Script to validate basic auth" ```python from jira import JIRA if __name__ == "__main__": try: # Jira server URL server = "https://..." # Basic auth username = "..." password = "..." # Jira ticket code (e.g. "PROJ-123") ticket_id = "..." print("Initializing JiraServerTicketProvider with JIRA server") # Initialize JIRA client jira = JIRA( server=server, basic_auth=(username, password), timeout=30 ) if jira: print(f"JIRA client initialized successfully") else: print("Error initializing JIRA client") # Fetch ticket details ticket = jira.issue(ticket_id) print(f"Ticket title: {ticket.fields.summary}") except Exception as e: print(f"Error fetching JIRA ticket details: {e}") ``` #### Using a Personal Access Token (PAT) for Jira Data Center/Server 1. Create a [Personal Access Token (PAT)](https://confluence.atlassian.com/enterprise/using-personal-access-tokens-1026032365.html) in your Jira account 2. In your Configuration file/Environment variables/Secrets file, add the following lines: ```toml [jira] jira_base_url = "YOUR_JIRA_BASE_URL" # e.g. https://jira.example.com jira_api_token = "YOUR_API_TOKEN" ``` ##### Validating PAT token via Python script If you are facing issues retrieving tickets in PR-Agent with PAT token, you can validate the flow using a Python script. The following steps will help you check if the token is working correctly, and if you can access the Jira ticket details: 1. run `pip install jira==3.8.0` 2. run the following Python script (after replacing the placeholders with your actual values): ??? 
example "Script to validate PAT token" ```python from jira import JIRA if __name__ == "__main__": try: # Jira server URL server = "https://..." # Jira PAT token token_auth = "..." # Jira ticket code (e.g. "PROJ-123") ticket_id = "..." print("Initializing JiraServerTicketProvider with JIRA server") # Initialize JIRA client jira = JIRA( server=server, token_auth=token_auth, timeout=30 ) if jira: print(f"JIRA client initialized successfully") else: print("Error initializing JIRA client") # Fetch ticket details ticket = jira.issue(ticket_id) print(f"Ticket title: {ticket.fields.summary}") except Exception as e: print(f"Error fetching JIRA ticket details: {e}") ``` ### Multi-JIRA Server Configuration PR-Agent supports connecting to multiple JIRA servers using different authentication methods. === "Email/Token (Basic Auth)" Configure multiple servers using Email/Token authentication: - `jira_servers`: List of JIRA server URLs - `jira_api_token`: List of API tokens (for Cloud) or passwords (for Data Center) - `jira_api_email`: List of emails (for Cloud) or usernames (for Data Center) - `jira_base_url`: Default server for ticket IDs like `PROJ-123`. Each repository can configure (in its local config file) its own `jira_base_url` to choose which server to use by default.
**Example Configuration:** ```toml [jira] # Server URLs jira_servers = ["https://company.atlassian.net", "https://datacenter.jira.com"] # API tokens/passwords jira_api_token = ["cloud_api_token_here", "datacenter_password"] # Emails/usernames (both required) jira_api_email = ["user@company.com", "datacenter_username"] # Default server for ticket IDs jira_base_url = "https://company.atlassian.net" ``` === "PAT Auth" Configure multiple servers using Personal Access Token authentication: - `jira_servers`: List of JIRA server URLs - `jira_api_token`: List of PAT tokens - `jira_api_email`: Not needed (can be omitted or left empty) - `jira_base_url`: Default server for ticket IDs like `PROJ-123`. Each repository can configure (in its local config file) its own `jira_base_url` to choose which server to use by default. **Example Configuration:** ```toml [jira] # Server URLs jira_servers = ["https://server1.jira.com", "https://server2.jira.com"] # PAT tokens only jira_api_token = ["pat_token_1", "pat_token_2"] # Default server for ticket IDs jira_base_url = "https://server1.jira.com" ``` **Mixed Authentication (Email/Token + PAT):** ```toml [jira] jira_servers = ["https://company.atlassian.net", "https://server.jira.com"] jira_api_token = ["cloud_api_token", "server_pat_token"] jira_api_email = ["user@company.com", ""] # Empty for PAT ``` ### How to link a PR to a Jira ticket To integrate with Jira, you can link your PR to a ticket using either of these methods: **Method 1: Description Reference:** Include a ticket reference in your PR description, using either the complete URL format `https://<JIRA_ORG>.atlassian.net/browse/ISSUE-123` or the shortened ticket ID `ISSUE-123` (without prefix or suffix for the shortened ID). **Method 2: Branch Name Detection:** Name your branch with the ticket ID as a prefix (e.g., `ISSUE-123-feature-description` or `ISSUE-123/feature-description`). !!!
note "Jira Base URL" For shortened ticket IDs or branch detection (method 2 for JIRA cloud), you must configure the Jira base URL in your configuration file under the [jira] section: ```toml [jira] jira_base_url = "https://<JIRA_ORG>.atlassian.net" ``` Where `<JIRA_ORG>` is your Jira organization identifier (e.g., `mycompany` for `https://mycompany.atlassian.net`). ================================================ FILE: docs/docs/core-abilities/index.md ================================================ # Core Abilities PR-Agent utilizes a variety of core abilities to provide a comprehensive and efficient code review experience. These abilities include: - [Compression strategy](./compression_strategy.md) - [Dynamic context](./dynamic_context.md) - [Fetching ticket context](./fetching_ticket_context.md) - [Interactivity](./interactivity.md) - [Local and global metadata](./metadata.md) - [Self-reflection](./self_reflection.md) ## Blogs Here are some additional technical blogs from Qodo that delve deeper into the core capabilities and features of Large Language Models (LLMs) when applied to coding tasks. These resources provide more comprehensive insights into leveraging LLMs for software development.
### Code Generation and LLMs - [Effective AI code suggestions: less is more](https://www.codium.ai/blog/effective-code-suggestions-llms-less-is-more/) - [State-of-the-art Code Generation with AlphaCodium – From Prompt Engineering to Flow Engineering](https://www.codium.ai/blog/qodoflow-state-of-the-art-code-generation-for-code-contests/) - [RAG for a Codebase with 10k Repos](https://www.codium.ai/blog/rag-for-large-scale-code-repos/) ### Development Processes - [Understanding the Challenges and Pain Points of the Pull Request Cycle](https://www.codium.ai/blog/understanding-the-challenges-and-pain-points-of-the-pull-request-cycle/) - [Introduction to Code Coverage Testing](https://www.codium.ai/blog/introduction-to-code-coverage-testing/) ### Cost Optimization - [Reduce Your Costs by 30% When Using GPT for Python Code](https://www.codium.ai/blog/reduce-your-costs-by-30-when-using-gpt-3-for-python-code/) ================================================ FILE: docs/docs/core-abilities/interactivity.md ================================================ # Interactivity `Supported Git Platforms: GitHub, GitLab` ## Overview PR-Agent transforms static code reviews into interactive experiences by enabling direct actions from pull request (PR) comments. Developers can immediately trigger actions and apply changes with simple checkbox clicks. This focused workflow maintains context while dramatically reducing the time between PR creation and final merge. The approach eliminates manual steps, provides clear visual indicators, and creates immediate feedback loops all within the same interface. ## Key Interactive Features ### 1\. Interactive `/improve` Tool The [`/improve`](../tools/improve.md) command delivers a comprehensive interactive experience: - _**Apply this suggestion**_: Clicking this checkbox instantly converts a suggestion into a committable code change. 
When committed to the PR, changes made to code that was flagged for improvement will be marked with a check mark, allowing developers to easily track and review implemented recommendations. - _**More**_: Triggers additional suggestions generation while keeping each suggestion focused and relevant as the original set - _**Update**_: Triggers a re-analysis of the code, providing updated suggestions based on the latest changes - _**Author self-review**_: Interactive acknowledgment that developers have opened and reviewed collapsed suggestions ### 2\. Interactive `/help` Tool The [`/help`](../tools/help.md) command not only lists available tools and their descriptions but also enables immediate tool invocation through interactive checkboxes. When a user checks a tool's checkbox, PR-Agent instantly triggers that tool without requiring additional commands. This transforms the standard help menu into an interactive launch pad for all PR-Agent capabilities, eliminating context switching by keeping developers within their PR workflow. ================================================ FILE: docs/docs/core-abilities/metadata.md ================================================ # Local and global metadata injection with multi-stage analysis `Supported Git Platforms: GitHub, GitLab, Bitbucket` 1\. PR-Agent initially retrieves for each PR the following data: - PR title and branch name - PR original description - Commit messages history - PR diff patches, in [hunk diff](https://loicpefferkorn.net/2014/02/diff-files-what-are-hunks-and-how-to-extract-them/) format - The entire content of the files that were modified in the PR !!! tip "Tip: Organization-level metadata" In addition to the inputs above, PR-Agent can incorporate supplementary preferences provided by the user, like [`extra_instructions` and `organization best practices`](../tools/improve.md#extra-instructions-and-best-practices). This information can be used to enhance the PR analysis. 2\. 
By default, the first command that PR-Agent executes is [`describe`](../tools/describe.md), which generates three types of outputs: - PR Type (e.g. bug fix, feature, refactor, etc) - PR Description - a bullet point summary of the PR - Changes walkthrough - for each modified file, provide a one-line summary followed by a detailed bullet point list of the changes. These AI-generated outputs are now considered as part of the PR metadata, and can be used in subsequent commands like `review` and `improve`. This effectively enables multi-stage chain-of-thought analysis, without making any additional API calls, which would cost time and money. For example, when generating code suggestions for different files, PR-Agent can inject the AI-generated ["Changes walkthrough"](https://github.com/qodo-ai/pr-agent/pull/1202#issue-2511546839) file summary in the prompt: ```diff ## File: 'src/file1.py' ### AI-generated file summary: - edited function `func1` that does X - Removed function `func2` that was not used - .... @@ ... @@ def func1(): __new hunk__ 11 unchanged code line0 12 unchanged code line1 13 +new code line2 added 14 unchanged code line3 __old hunk__ unchanged code line0 unchanged code line1 -old code line2 removed unchanged code line3 @@ ... @@ def func2(): __new hunk__ ... __old hunk__ ... ``` 3\. The entire PR files that were retrieved are also used to expand and enhance the PR context (see [Dynamic Context](./dynamic_context.md)). 4\. All the metadata described above represents several levels of cumulative analysis - ranging from hunk level, to file level, to PR level, to organization level. This comprehensive approach enables PR-Agent AI models to generate more precise and contextually relevant suggestions and feedback.
================================================ FILE: docs/docs/core-abilities/self_reflection.md ================================================ `Supported Git Platforms: GitHub, GitLab, Bitbucket` PR-Agent implements a **self-reflection** process where the AI model reflects, scores, and re-ranks its own suggestions, eliminating irrelevant or incorrect ones. This approach improves the quality and relevance of suggestions, saving users time and enhancing their experience. Configuration options allow users to set a score threshold for further filtering out suggestions. ## Introduction - Efficient Review with Hierarchical Presentation Given that not all generated code suggestions will be relevant, it is crucial to enable users to review them in a fast and efficient way, allowing quick identification and filtering of non-applicable ones. To achieve this goal, PR-Agent offers a dedicated hierarchical structure when presenting suggestions to users: - A "category" section groups suggestions by their category, allowing users to quickly dismiss irrelevant suggestions. - Each suggestion is first described by a one-line summary, which can be expanded to a full description by clicking on a collapsible. - Upon expanding a suggestion, the user receives a more comprehensive description, and a code snippet demonstrating the recommendation. !!! note "Fast Review" This hierarchical structure is designed to facilitate rapid review of each suggestion, with users spending an average of ~5-10 seconds per item. ## Self-reflection and Re-ranking The AI model is initially tasked with generating suggestions, and outputting them in order of importance. However, in practice we observe that models often struggle to simultaneously generate high-quality code suggestions and rank them well in a single pass. Furthermore, the initial set of generated suggestions sometimes contains easily identifiable errors. 
To address these issues, we implemented a "self-reflection" process that refines suggestion ranking and eliminates irrelevant or incorrect proposals. This process consists of the following steps: 1. Presenting the generated suggestions to the model in a follow-up call. 2. Instructing the model to score each suggestion on a scale of 0-10 and provide a rationale for the assigned score. 3. Utilizing these scores to re-rank the suggestions and filter out incorrect ones (with a score of 0). 4. Optionally, filtering out all suggestions below a user-defined score threshold. Note that presenting all generated suggestions simultaneously provides the model with a comprehensive context, enabling it to make more informed decisions compared to evaluating each suggestion individually. To conclude, the self-reflection process enables PR-Agent to prioritize suggestions based on their importance, eliminate inaccurate or irrelevant proposals, and optionally exclude suggestions that fall below a specified threshold of significance. This results in a more refined and valuable set of suggestions for the user, saving time and improving the overall experience. 
## Example Results ![self_reflection](https://codium.ai/images/pr_agent/self_reflection1.png){width=768} ![self_reflection](https://codium.ai/images/pr_agent/self_reflection2.png){width=768} ## Appendix - Relevant Configuration Options ```toml [pr_code_suggestions] suggestions_score_threshold = 0 # Filter out suggestions with a score below this threshold (0-10) ``` ================================================ FILE: docs/docs/css/custom.css ================================================ /* Neutral color scheme - ready for future branding */ :root { --md-primary-fg-color: #0f172a; --md-accent-fg-color: #1d4ed8; --md-typeset-a-color: #1e40af; } [data-md-color-scheme="slate"] { --md-primary-fg-color: #0b1220; --md-accent-fg-color: #38bdf8; --md-typeset-a-color: #7dd3fc; --md-default-bg-color: #0b1220; --md-default-fg-color: #e5e7eb; --md-default-fg-color--light: rgba(229, 231, 235, 0.7); --md-default-fg-color--lighter: rgba(229, 231, 235, 0.5); --md-default-fg-color--lightest: rgba(229, 231, 235, 0.3); --md-code-bg-color: #0f172a; } .md-nav--primary { .md-nav__link { font-size: 18px; } } .md-nav--primary { position: relative; } .md-nav--primary::before { content: ""; position: absolute; top: 0; right: 10px; width: 2px; height: 100%; background-color: #e5e7eb; } [data-md-color-scheme="slate"] .md-nav--primary::before { background-color: #1f2937; } [data-md-color-scheme="slate"] .md-header { background-color: #0d1b36; } [data-md-color-scheme="slate"] .md-tabs { background-color: #0b1220; border-top: 1px solid rgba(148, 163, 184, 0.25); } [data-md-color-scheme="slate"] .md-tabs__link { color: #e2e8f0; } [data-md-color-scheme="slate"] .md-tabs__link--active, [data-md-color-scheme="slate"] .md-tabs__link:hover { color: #ffffff; text-decoration: underline; text-underline-offset: 0.25rem; } [data-md-color-scheme="slate"] .md-search__form { background-color: #0f172a; border: 1px solid rgba(148, 163, 184, 0.4); } [data-md-color-scheme="slate"] .md-search__input { color: 
#e2e8f0; } [data-md-color-scheme="slate"] .md-search__input::placeholder { color: rgba(226, 232, 240, 0.7); } [data-md-color-scheme="slate"] .md-search__icon { color: rgba(226, 232, 240, 0.85); } .md-tabs__link { font-size: 18px; } .md-header__title { font-size: 20px; margin-left: 12px !important; } .md-header__button.md-logo, .md-nav__title .md-logo { display: none; } .md-content img { border-width: 1px; border-style: solid; border-color: rgba(15, 23, 42, 0.2); outline-width: 1px; outline-style: solid; outline-color: rgba(15, 23, 42, 0.25); } [data-md-color-scheme="slate"] .md-content img { border-color: rgba(226, 232, 240, 0.2); outline-color: rgba(226, 232, 240, 0.3); } .md-banner { background-color: #1d4ed8; } [data-md-color-scheme="slate"] .md-banner { background-color: #2563eb; } .md-banner .md-typeset a, .md-banner .md-typeset a:hover, .md-banner .md-typeset a:focus { color: currentColor; text-decoration: underline; } ================================================ FILE: docs/docs/faq/index.md ================================================ # FAQ ??? note "Q: Can PR-Agent serve as a substitute for a human reviewer?" #### Answer:1 PR-Agent is designed to assist, not replace, human reviewers. Reviewing PRs is a tedious and time-consuming task often seen as a "chore". In addition, the longer the PR – the shorter the relative feedback, since long PRs can overwhelm reviewers, both in terms of technical difficulty, and the actual review time. PR-Agent aims to address these pain points, and to assist and empower both the PR author and reviewer. However, PR-Agent has built-in safeguards to ensure the developer remains in the driver's seat. For example: 1. Preserves user's original PR header 2. Places user's description above the AI-generated PR description 3. Won't approve PRs; approval remains reviewer's responsibility 4. 
The code suggestions are optional, and aim to: - Encourage self-review and self-reflection - Highlight potential bugs or oversights - Enhance code quality and promote best practices Read more about this issue in our [blog](https://www.qodo.ai/blog/understanding-the-challenges-and-pain-points-of-the-pull-request-cycle/) ___ ??? note "Q: I received an incorrect or irrelevant suggestion. Why?" #### Answer:2 - Modern AI models, like Claude Sonnet and GPT-5, are improving rapidly but remain imperfect. Users should critically evaluate all suggestions rather than accepting them automatically. - AI errors are rare, but possible. A main value from reviewing the code suggestions lies in their high probability of catching **mistakes or bugs made by the PR author**. We believe it's worth spending 30-60 seconds reviewing suggestions, even if some aren't relevant, as this practice can enhance code quality and prevent bugs in production. - The hierarchical structure of the suggestions is designed to help the user _quickly_ understand them, and to decide which ones are relevant and which are not: - Only if the `Category` header is relevant, the user should move to the summarized suggestion description. - Only if the summarized suggestion description is relevant, the user should click on the collapsible, to read the full suggestion description with a code preview example. - In addition, we recommend to use the [`extra_instructions`](../tools/improve.md#extra-instructions-and-best-practices) field to guide the model to suggestions that are more relevant to the specific needs of the project. ___ ??? note "Q: How can I get more tailored suggestions?" #### Answer:3 See [here](../tools/improve.md#extra-instructions-and-best-practices) for more information on how to use the `extra_instructions` and `best_practices` configuration options, to guide the model to more tailored suggestions. ___ ??? note "Q: Will you store my code? Are you using my code to train models?" #### Answer:4 No. 
PR-Agent's strict privacy policy ensures that your code is not stored or used for training purposes. For a detailed overview of our data privacy policy, please refer to [this link](../overview/data_privacy.md). ___ ??? note "Q: Can PR-Agent review draft/offline PRs?" #### Answer:6 Yes. While PR-Agent won't automatically review draft PRs, you can still get feedback by manually requesting it through [online commenting](../usage-guide/automations_and_usage.md#online-usage). For active PRs, you can customize the automatic feedback settings [here](../usage-guide/automations_and_usage.md#pr-agent-automatic-feedback) to match your team's workflow. ___ ??? note "Q: Can the 'Review effort' feedback be calibrated or customized?" #### Answer:7 Yes, you can customize review effort estimates using the `extra_instructions` configuration option (see [documentation](../tools/review.md#configuration-options)). Example mapping: - Effort 1: < 30 minutes review time - Effort 2: 30-60 minutes review time - Effort 3: 60-90 minutes review time - ... Note: The effort levels (1-5) are primarily meant for _comparative_ purposes, helping teams prioritize reviewing smaller PRs first. The actual review duration may vary, as the focus is on providing consistent relative effort estimates. ___ ??? note "Q: How to reduce the noise generated by PR-Agent?" #### Answer:3 The default configuration of PR-Agent is designed to balance helpful feedback with noise reduction.
It reduces noise through several approaches: - Auto-feedback uses three highly structured tools (`/describe`, `/review`, and `/improve`), designed to be accessible at a glance without creating large visual overload - Suggestions are presented in a table format rather than as committable comments, which are far noisier - The 'File Walkthrough' section is folded by default, as it tends to be verbose - Intermediate comments are avoided when creating new PRs (like "PR-Agent is now reviewing your PR..."), which would generate email noise From our experience, especially in large teams or organizations, complaints about "noise" sometimes stem from the following issues: - **Feedback from multiple bots**: When multiple bots provide feedback on the same PR, it creates confusion and noise. We recommend using PR-Agent as the primary feedback tool to streamline the process and reduce redundancy. - **Getting familiar with the tool**: Unlike many tools that provide feedback only on demand, PR-Agent automatically analyzes and suggests improvements for every code change. While this proactive approach can feel intimidating at first, it's designed to continuously enhance code quality and catch bugs and problems when they occur. We recommend reviewing [this guide](../tools/improve.md#understanding-ai-code-suggestions) to help align expectations and maximize the value of PR-Agent's auto-feedback. Therefore, at a global configuration level, we recommend using the default configuration, which is designed to reduce noise while providing valuable feedback. However, if you still find the feedback too noisy, you can adjust the configuration. Since each user and team has different needs, it's definitely possible - and even recommended - to adjust configurations for specific repos as needed. 
Ways to adjust the configuration for noise reduction include for example: - [Score thresholds for code suggestions](../tools/improve.md#configuration-options) - [Utilizing the `extra_instructions` field for more tailored feedback](../tools/improve.md#extra-instructions) - [Controlling which tools run automatically](../usage-guide/automations_and_usage.md#github-app-automatic-tools-when-a-new-pr-is-opened) Note that some users may prefer the opposite - more thorough and detailed feedback. PR-Agent is designed to be flexible and customizable, allowing you to tailor the feedback to your team's specific needs and preferences. Examples of ways to increase feedback include: - [Dual-publishing mode](../tools/improve.md#dual-publishing-mode) - [Interactive usage](../core-abilities/interactivity.md) ___ ================================================ FILE: docs/docs/index.md ================================================ # Overview [PR-Agent](https://github.com/qodo-ai/pr-agent) is an open-source, AI-powered code review agent and a community-maintained legacy project of Qodo. It is distinct from Qodo's primary AI code review offering, which provides a feature-rich, context-aware experience. Qodo now offers a free tier that integrates seamlessly with GitHub, GitLab, Bitbucket, and Azure DevOps for high-quality automated reviews. - See the [Installation Guide](./installation/index.md) for instructions on installing and running the tool on different git platforms. - See the [Usage Guide](./usage-guide/index.md) for instructions on running commands via different interfaces, including _CLI_, _online usage_, or by _automatically triggering_ them when a new PR is opened. - See the [Tools Guide](./tools/index.md) for a detailed description of the different tools. 
## Docs Smart Search To search the documentation site using natural language: 1) Comment `/help "your question"` in a pull request where PR-Agent is installed 2) The bot will respond with an [answer](https://github.com/qodo-ai/pr-agent/pull/1241#issuecomment-2365259334) that includes relevant documentation links. ## Features PR-Agent offers comprehensive pull request functionalities integrated with various git providers: | | | GitHub | GitLab | Bitbucket | Azure DevOps | Gitea | | ----- |---------------------------------------------------------------------------------------|:------:|:------:|:---------:|:------------:|:-----:| | [TOOLS](./tools/index.md) | [Describe](./tools/describe.md) | ✅ | ✅ | ✅ | ✅ | ✅ | | | [Review](./tools/review.md) | ✅ | ✅ | ✅ | ✅ | ✅ | | | [Improve](./tools/improve.md) | ✅ | ✅ | ✅ | ✅ | ✅ | | | [Ask](./tools/ask.md) | ✅ | ✅ | ✅ | ✅ | | | | ⮑ [Ask on code lines](./tools/ask.md#ask-lines) | ✅ | ✅ | | | | | | [Add Docs](./tools/add_docs.md) | ✅ | ✅ | ✅ | ✅ | | | | [Generate Labels](./tools/generate_labels.md) | ✅ | ✅ | ✅ | ✅ | | | | [Similar Issues](./tools/similar_issues.md) | ✅ | | | | | | | [Help](./tools/help.md) | ✅ | ✅ | ✅ | ✅ | | | | [Help Docs](./tools/help_docs.md) | ✅ | ✅ | ✅ | | | | | [Update CHANGELOG](./tools/update_changelog.md) | ✅ | ✅ | ✅ | ✅ | | | | | | | | | | | [USAGE](./usage-guide/index.md) | [CLI](./usage-guide/automations_and_usage.md#local-repo-cli) | ✅ | ✅ | ✅ | ✅ | ✅ | | | [App / webhook](./usage-guide/automations_and_usage.md#github-app) | ✅ | ✅ | ✅ | ✅ | ✅ | | | [Tagging bot](https://github.com/qodo-ai/pr-agent#try-it-now) | ✅ | | | | | | | [Actions](./installation/github.md#run-as-a-github-action) | ✅ | ✅ | ✅ | ✅ | | | | | | | | | | | [CORE](./core-abilities/index.md) | [Adaptive and token-aware file patch fitting](./core-abilities/compression_strategy.md) | ✅ | ✅ | ✅ | ✅ | | | | [Chat on code suggestions](./core-abilities/interactivity.md) | ✅ | ✅ | | | | | | [Compression 
strategy](./core-abilities/compression_strategy.md) | ✅ | ✅ | ✅ | ✅ | | | | [Dynamic context](./core-abilities/dynamic_context.md) | ✅ | ✅ | ✅ | ✅ | | | | [Fetching ticket context](./core-abilities/fetching_ticket_context.md) | ✅ | ✅ | ✅ | | | | | [Interactivity](./core-abilities/interactivity.md) | ✅ | ✅ | | | | | | [Local and global metadata](./core-abilities/metadata.md) | ✅ | ✅ | ✅ | ✅ | | | | [Multiple models support](./usage-guide/changing_a_model.md) | ✅ | ✅ | ✅ | ✅ | | | | [Self reflection](./core-abilities/self_reflection.md) | ✅ | ✅ | ✅ | ✅ | | ## Example Results
#### [/describe](https://github.com/qodo-ai/pr-agent/pull/530)
![/describe](https://www.codium.ai/images/pr_agent/describe_new_short_main.png){width=512}

#### [/review](https://github.com/qodo-ai/pr-agent/pull/732#issuecomment-1975099151)
![/review](https://www.codium.ai/images/pr_agent/review_new_short_main.png){width=512}

#### [/improve](https://github.com/qodo-ai/pr-agent/pull/732#issuecomment-1975099159)
![/improve](https://www.codium.ai/images/pr_agent/improve_new_short_main.png){width=512}

## How it Works The following diagram illustrates PR-Agent tools and their flow: ![PR-Agent Tools](https://codium.ai/images/pr_agent/diagram-v0.9.png) Check out the [PR Compression strategy](core-abilities/index.md) page for more details on how we convert a code diff to a manageable LLM prompt ================================================ FILE: docs/docs/installation/azure.md ================================================ ## Azure DevOps Pipeline You can use a pre-built Action Docker image to run PR-Agent as an Azure DevOps pipeline. Add the following file to your repository under `azure-pipelines.yml`: ```yaml # Opt out of CI triggers trigger: none # Configure PR trigger # pr: # branches: # include: # - '*' # autoCancel: true # drafts: false # NOTE for Azure Repos Git: # Azure Repos does not honor YAML pr: triggers. Configure Build Validation # via Branch Policies instead (see note below). You can safely omit pr:. stages: - stage: pr_agent displayName: 'PR Agent Stage' jobs: - job: pr_agent_job displayName: 'PR Agent Job' pool: vmImage: 'ubuntu-latest' container: image: codiumai/pr-agent:latest options: --entrypoint "" variables: - group: pr_agent steps: - script: | echo "Running PR Agent action step" # Construct PR_URL PR_URL="${SYSTEM_COLLECTIONURI}${SYSTEM_TEAMPROJECT}/_git/${BUILD_REPOSITORY_NAME}/pullrequest/${SYSTEM_PULLREQUEST_PULLREQUESTID}" echo "PR_URL=$PR_URL" # Extract organization URL from System.CollectionUri ORG_URL=$(echo "$(System.CollectionUri)" | sed 's/\/$//') # Remove trailing slash if present echo "Organization URL: $ORG_URL" export azure_devops__org="$ORG_URL" export config__git_provider="azure" pr-agent --pr_url="$PR_URL" describe pr-agent --pr_url="$PR_URL" review pr-agent --pr_url="$PR_URL" improve env: azure_devops__pat: $(azure_devops_pat) openai__key: $(OPENAI_KEY) displayName: 'Run PR-Agent' ``` This script will run PR-Agent on every new merge request, with the `improve`, `review`, and `describe` commands. 
Note that you need to define the `azure_devops_pat` and `OPENAI_KEY` variables in the Azure DevOps pipeline settings (Pipelines -> Library -> + Variable group): ![PR-Agent](https://codium.ai/images/pr_agent/azure_devops_pipeline_secrets.png){width=468} Make sure to give pipeline permissions to the `pr_agent` variable group. > Note that Azure Pipelines lacks support for triggering workflows from PR comments. If you find a viable solution, please contribute it to our [issue tracker](https://github.com/qodo-ai/pr-agent/issues) ### Azure Repos Git PR triggers and Build Validation Azure Repos Git does not use YAML `pr:` triggers for pipelines. Instead, configure Build Validation on the target branch to run the PR Agent pipeline for pull requests: 1. Go to Project Settings → Repositories → Branches. 2. Select the target branch and open Branch Policies. 3. Under Build Validation, add a policy: - Select the PR Agent pipeline (the `azure-pipelines.yml` above). - Set it as Required. 4. Remove the `pr:` section from your YAML (not needed for Azure Repos Git). This distinction applies specifically to Azure Repos Git. Other providers like GitHub and Bitbucket Cloud can use YAML-based PR triggers. ## Azure DevOps from CLI To use the Azure DevOps provider, use the following settings in configuration.toml: ```toml [config] git_provider="azure" ``` The Azure DevOps provider supports [PAT token](https://learn.microsoft.com/en-us/azure/devops/organizations/accounts/use-personal-access-tokens-to-authenticate?view=azure-devops&tabs=Windows) or [DefaultAzureCredential](https://learn.microsoft.com/en-us/azure/developer/python/sdk/authentication-overview#authentication-in-server-environments) authentication. PAT is faster to create, but has a built-in expiration date, and will use the user identity for API calls. Using DefaultAzureCredential you can use a managed identity or a service principal, which are more secure and will create a separate ADO user identity (via AAD) for the agent.
If PAT was chosen, you can assign the value in .secrets.toml. If DefaultAzureCredential was chosen, you can assign the additional env vars like AZURE_CLIENT_SECRET directly, or use managed identity/az cli (for local development) without any additional configuration. In any case, the 'org' value must be assigned in .secrets.toml: ```toml [azure_devops] org = "https://dev.azure.com/YOUR_ORGANIZATION/" # pat = "YOUR_PAT_TOKEN" needed only if using PAT for authentication ``` ## Azure DevOps Webhook To trigger from an Azure webhook, you need to manually [add a webhook](https://learn.microsoft.com/en-us/azure/devops/service-hooks/services/webhooks?view=azure-devops). Use the "Pull request created" type to trigger a review, or "Pull request commented on" to trigger any supported command with a `/` comment on the relevant PR. Note that for the "Pull request commented on" trigger, only API v2.0 is supported. For webhook security, create a random username/password pair and configure the webhook username and password on both the server and the Azure DevOps webhook. These will be sent as Basic Auth data by the webhook with each request: ```toml [azure_devops_server] webhook_username = "<basic auth user>" webhook_password = "<basic auth password>" ``` > :warning: **Ensure that the webhook endpoint is only accessible over HTTPS** to mitigate the risk of credential interception when using basic authentication. ================================================ FILE: docs/docs/installation/bitbucket.md ================================================ ## Run as a Bitbucket Pipeline You can use the Bitbucket Pipeline system to run PR-Agent on every pull request open or update. 1. Add the following file in your repository bitbucket-pipelines.yml ```yaml pipelines: pull-requests: '**': - step: name: PR Agent Review image: codiumai/pr-agent:latest script: - pr-agent --pr_url=https://bitbucket.org/$BITBUCKET_WORKSPACE/$BITBUCKET_REPO_SLUG/pull-requests/$BITBUCKET_PR_ID review ``` 2.
Add the following secure variables to your repository under Repository settings > Pipelines > Repository variables. - CONFIG__GIT_PROVIDER: `bitbucket` - OPENAI__KEY: `` - BITBUCKET__AUTH_TYPE: `basic` or `bearer` (default is `bearer`) - BITBUCKET__BEARER_TOKEN: `` (required when auth_type is bearer) - BITBUCKET__BASIC_TOKEN: `` (required when auth_type is basic) You can get a Bitbucket token for your repository by following Repository Settings -> Security -> Access Tokens. For basic auth, you can generate a base64 encoded token from your username:password combination. Note that comments on a PR are not supported in Bitbucket Pipeline. ## Bitbucket Server and Data Center Log into your on-prem instance of Bitbucket with your service account username and password. Navigate to `Manage account`, `HTTP Access tokens`, `Create Token`. Generate the token and add it to .secrets.toml under the `bitbucket_server` section: ```toml [bitbucket_server] bearer_token = "" ``` Don't forget to also set the URL of your Bitbucket Server instance (either in `.secrets.toml` or in `configuration.toml`): ```toml [bitbucket_server] url = "" ``` ### Run it as CLI Modify `configuration.toml`: ```toml git_provider="bitbucket_server" ``` and pass the Pull request URL: ```shell python cli.py --pr_url https://git.on-prem-instance-of-bitbucket.com/projects/PROJECT/repos/REPO/pull-requests/1 review ``` ### Run it as service To run PR-Agent as a webhook, build the docker image: ```bash docker build . -t codiumai/pr-agent:bitbucket_server_webhook --target bitbucket_server_webhook -f docker/Dockerfile docker push codiumai/pr-agent:bitbucket_server_webhook # Push to your Docker repository ``` Navigate to `Projects` or `Repositories`, `Settings`, `Webhooks`, `Create Webhook`. Fill in the name and URL. For Authentication, select 'None'. Select the 'Pull Request Opened' checkbox to receive that event as a webhook.
The URL should end with `/webhook`, for example: https://domain.com/webhook ================================================ FILE: docs/docs/installation/gitea.md ================================================ ## Run a Gitea webhook server 1. In Gitea create a new user and give it the "Reporter" role for the intended group or project. 2. For the user from step 1, generate a `personal_access_token` with `api` access. 3. Generate a random secret for your app, and save it for later (`webhook_secret`). For example, you can use: ```bash WEBHOOK_SECRET=$(python -c "import secrets; print(secrets.token_hex(10))") ``` 4. Clone this repository: ```bash git clone https://github.com/qodo-ai/pr-agent.git ``` 5. Prepare variables and secrets. Skip this step if you plan on setting these as environment variables when running the agent: - In the configuration file/variables: - Set `config.git_provider` to "gitea" - In the secrets file/variables: - Set your AI model key in the respective section - In the [Gitea] section, set `personal_access_token` (with token from step 2) and `webhook_secret` (with secret from step 3) 6. Build a Docker image for the app and optionally push it to a Docker repository. We'll use Dockerhub as an example: ```bash docker build -f /docker/Dockerfile -t pr-agent:gitea_app --target gitea_app . docker push codiumai/pr-agent:gitea_webhook # Push to your Docker repository ``` 7. Set the environmental variables, the method depends on your docker runtime. Skip this step if you included your secrets/configuration directly in the Docker image. ```bash CONFIG__GIT_PROVIDER=gitea GITEA__PERSONAL_ACCESS_TOKEN= GITEA__WEBHOOK_SECRET= GITEA__URL=https://gitea.com # Or self host OPENAI__KEY= GITEA__SKIP_SSL_VERIFICATION=false # or true GITEA__SSL_CA_CERT=/path/to/cacert.pem ``` 8. Create a webhook in your Gitea project.
Set the URL to `http[s]:///api/v1/gitea_webhooks`, the secret token to the generated secret from step 3, and enable the triggers `push`, `comments` and `merge request events`. 9. Test your installation by opening a merge request or commenting on a merge request using one of PR Agent's commands. ================================================ FILE: docs/docs/installation/github.md ================================================ In this page we will cover how to install and run PR-Agent as a GitHub Action or GitHub App, and how to configure it for your needs. ## Run as a GitHub Action You can use our pre-built Github Action Docker image to run PR-Agent as a Github Action. 1) Add the following file to your repository under `.github/workflows/pr_agent.yml`: ```yaml on: pull_request: types: [opened, reopened, ready_for_review] issue_comment: jobs: pr_agent_job: if: ${{ github.event.sender.type != 'Bot' }} runs-on: ubuntu-latest permissions: issues: write pull-requests: write contents: write name: Run pr agent on every pull request, respond to user comments steps: - name: PR Agent action step id: pragent uses: qodo-ai/pr-agent@main env: OPENAI_KEY: ${{ secrets.OPENAI_KEY }} GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} ``` 2) Add the following secret to your repository under `Settings > Secrets and variables > Actions > New repository secret > Add secret`: ``` Name = OPENAI_KEY Secret = ``` The GITHUB_TOKEN secret is automatically created by GitHub. 3) Merge this change to your main branch. When you open your next PR, you should see a comment from `github-actions` bot with a review of your PR, and instructions on how to use the rest of the tools. 4) You may configure PR-Agent by adding environment variables under the env section corresponding to any configurable property in the [configuration](https://github.com/qodo-ai/pr-agent/blob/main/pr_agent/settings/configuration.toml) file. Some examples: ```yaml env: # ... 
previous environment values OPENAI.ORG: "" PR_REVIEWER.REQUIRE_TESTS_REVIEW: "false" # Disable tests review PR_CODE_SUGGESTIONS.NUM_CODE_SUGGESTIONS: 6 # Increase number of code suggestions ``` See detailed usage instructions in the [USAGE GUIDE](../usage-guide/automations_and_usage.md#github-action) ### Configuration Examples This section provides detailed, step-by-step examples for configuring PR-Agent with different models and advanced options in GitHub Actions. #### Quick Start Examples ##### Basic Setup (OpenAI Default) Copy this minimal workflow to get started with the default OpenAI models: ```yaml name: PR Agent on: pull_request: types: [opened, reopened, ready_for_review] issue_comment: jobs: pr_agent_job: if: ${{ github.event.sender.type != 'Bot' }} runs-on: ubuntu-latest permissions: issues: write pull-requests: write contents: write steps: - name: PR Agent action step uses: qodo-ai/pr-agent@main env: OPENAI_KEY: ${{ secrets.OPENAI_KEY }} GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} ``` ##### Gemini Setup Ready-to-use workflow for Gemini models: ```yaml name: PR Agent (Gemini) on: pull_request: types: [opened, reopened, ready_for_review] issue_comment: jobs: pr_agent_job: if: ${{ github.event.sender.type != 'Bot' }} runs-on: ubuntu-latest permissions: issues: write pull-requests: write contents: write steps: - name: PR Agent action step uses: qodo-ai/pr-agent@main env: GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} config.model: "gemini/gemini-1.5-flash" config.fallback_models: '["gemini/gemini-1.5-flash"]' GOOGLE_AI_STUDIO.GEMINI_API_KEY: ${{ secrets.GEMINI_API_KEY }} github_action_config.auto_review: "true" github_action_config.auto_describe: "true" github_action_config.auto_improve: "true" ``` ##### Claude Setup Ready-to-use workflow for Claude models: ```yaml name: PR Agent (Claude) on: pull_request: types: [opened, reopened, ready_for_review] issue_comment: jobs: pr_agent_job: if: ${{ github.event.sender.type != 'Bot' }} runs-on: ubuntu-latest permissions: 
issues: write pull-requests: write contents: write steps: - name: PR Agent action step uses: qodo-ai/pr-agent@main env: GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} config.model: "anthropic/claude-3-opus-20240229" config.fallback_models: '["anthropic/claude-3-haiku-20240307"]' ANTHROPIC.KEY: ${{ secrets.ANTHROPIC_KEY }} github_action_config.auto_review: "true" github_action_config.auto_describe: "true" github_action_config.auto_improve: "true" ``` #### Basic Configuration with Tool Controls Start with this enhanced workflow that includes tool configuration: ```yaml on: pull_request: types: [opened, reopened, ready_for_review] issue_comment: jobs: pr_agent_job: if: ${{ github.event.sender.type != 'Bot' }} runs-on: ubuntu-latest permissions: issues: write pull-requests: write contents: write name: Run pr agent on every pull request, respond to user comments steps: - name: PR Agent action step id: pragent uses: qodo-ai/pr-agent@main env: OPENAI_KEY: ${{ secrets.OPENAI_KEY }} GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} # Enable/disable automatic tools github_action_config.auto_review: "true" github_action_config.auto_describe: "true" github_action_config.auto_improve: "true" # Configure which PR events trigger the action github_action_config.pr_actions: '["opened", "reopened", "ready_for_review", "review_requested"]' ``` #### Switching Models ##### Using Gemini (Google AI Studio) To use Gemini models instead of the default OpenAI models: ```yaml env: GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} # Set the model to Gemini config.model: "gemini/gemini-1.5-flash" config.fallback_models: '["gemini/gemini-1.5-flash"]' # Add your Gemini API key GOOGLE_AI_STUDIO.GEMINI_API_KEY: ${{ secrets.GEMINI_API_KEY }} # Tool configuration github_action_config.auto_review: "true" github_action_config.auto_describe: "true" github_action_config.auto_improve: "true" ``` **Required Secrets:** - Add `GEMINI_API_KEY` to your repository secrets (get it from [Google AI 
Studio](https://aistudio.google.com/)) **Note:** When using non-OpenAI models like Gemini, you don't need to set `OPENAI_KEY` - only the model-specific API key is required. ##### Using Claude (Anthropic) To use Claude models: ```yaml env: GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} # Set the model to Claude config.model: "anthropic/claude-3-opus-20240229" config.fallback_models: '["anthropic/claude-3-haiku-20240307"]' # Add your Anthropic API key ANTHROPIC.KEY: ${{ secrets.ANTHROPIC_KEY }} # Tool configuration github_action_config.auto_review: "true" github_action_config.auto_describe: "true" github_action_config.auto_improve: "true" ``` **Required Secrets:** - Add `ANTHROPIC_KEY` to your repository secrets (get it from [Anthropic Console](https://console.anthropic.com/)) **Note:** When using non-OpenAI models like Claude, you don't need to set `OPENAI_KEY` - only the model-specific API key is required. ##### Using Azure OpenAI To use Azure OpenAI services: ```yaml env: OPENAI_KEY: ${{ secrets.AZURE_OPENAI_KEY }} GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} # Azure OpenAI configuration OPENAI.API_TYPE: "azure" OPENAI.API_VERSION: "2023-05-15" OPENAI.API_BASE: ${{ secrets.AZURE_OPENAI_ENDPOINT }} OPENAI.DEPLOYMENT_ID: ${{ secrets.AZURE_OPENAI_DEPLOYMENT }} # Set the model to match your Azure deployment config.model: "gpt-4o" config.fallback_models: '["gpt-4o"]' # Tool configuration github_action_config.auto_review: "true" github_action_config.auto_describe: "true" github_action_config.auto_improve: "true" ``` **Required Secrets:** - `AZURE_OPENAI_KEY`: Your Azure OpenAI API key - `AZURE_OPENAI_ENDPOINT`: Your Azure OpenAI endpoint URL - `AZURE_OPENAI_DEPLOYMENT`: Your deployment name ##### Using Local Models (Ollama) To use local models via Ollama: ```yaml env: OPENAI_KEY: ${{ secrets.OPENAI_KEY }} GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} # Set the model to a local Ollama model config.model: "ollama/qwen2.5-coder:32b" config.fallback_models: 
'["ollama/qwen2.5-coder:32b"]' config.custom_model_max_tokens: "128000" # Ollama configuration OLLAMA.API_BASE: "http://localhost:11434" # Tool configuration github_action_config.auto_review: "true" github_action_config.auto_describe: "true" github_action_config.auto_improve: "true" ``` **Note:** For local models, you'll need to use a self-hosted runner with Ollama installed, as GitHub Actions hosted runners cannot access localhost services. #### Advanced Configuration Options ##### Custom Review Instructions Add specific instructions for the review process: ```yaml env: OPENAI_KEY: ${{ secrets.OPENAI_KEY }} GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} # Custom review instructions pr_reviewer.extra_instructions: "Focus on security vulnerabilities and performance issues. Check for proper error handling." # Tool configuration github_action_config.auto_review: "true" github_action_config.auto_describe: "true" github_action_config.auto_improve: "true" ``` ##### Language-Specific Configuration Configure for specific programming languages: ```yaml env: OPENAI_KEY: ${{ secrets.OPENAI_KEY }} GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} # Language-specific settings pr_reviewer.extra_instructions: "Focus on Python best practices, type hints, and docstrings." 
pr_code_suggestions.num_code_suggestions: "8" pr_code_suggestions.suggestions_score_threshold: "7" # Tool configuration github_action_config.auto_review: "true" github_action_config.auto_describe: "true" github_action_config.auto_improve: "true" ``` ##### Selective Tool Execution Run only specific tools automatically: ```yaml env: OPENAI_KEY: ${{ secrets.OPENAI_KEY }} GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} # Only run review and describe, skip improve github_action_config.auto_review: "true" github_action_config.auto_describe: "true" github_action_config.auto_improve: "false" # Only trigger on PR open and reopen github_action_config.pr_actions: '["opened", "reopened"]' ``` #### Using Configuration Files Instead of setting all options via environment variables, you can use a `.pr_agent.toml` file in your repository root: 1. Create a `.pr_agent.toml` file in your repository root: ```toml [config] model = "gemini/gemini-1.5-flash" fallback_models = ["anthropic/claude-3-opus-20240229"] [pr_reviewer] extra_instructions = "Focus on security issues and code quality." [pr_code_suggestions] num_code_suggestions = 6 suggestions_score_threshold = 7 ``` 2. Use a simpler workflow file: ```yaml on: pull_request: types: [opened, reopened, ready_for_review] issue_comment: jobs: pr_agent_job: if: ${{ github.event.sender.type != 'Bot' }} runs-on: ubuntu-latest permissions: issues: write pull-requests: write contents: write name: Run pr agent on every pull request, respond to user comments steps: - name: PR Agent action step id: pragent uses: qodo-ai/pr-agent@main env: GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} GOOGLE_AI_STUDIO.GEMINI_API_KEY: ${{ secrets.GEMINI_API_KEY }} ANTHROPIC.KEY: ${{ secrets.ANTHROPIC_KEY }} github_action_config.auto_review: "true" github_action_config.auto_describe: "true" github_action_config.auto_improve: "true" ``` #### Troubleshooting Common Issues ##### Model Not Found Errors If you get model not found errors: 1. 
**Check model name format**: Ensure you're using the correct model identifier format (e.g., `gemini/gemini-1.5-flash`, not just `gemini-1.5-flash`) 2. **Verify API keys**: Make sure your API keys are correctly set as repository secrets 3. **Check model availability**: Some models may not be available in all regions or may require specific access ##### Environment Variable Format Remember these key points about environment variables: - Use dots (`.`) or double underscores (`__`) to separate sections and keys - Boolean values should be strings: `"true"` or `"false"` - Arrays should be JSON strings: `'["item1", "item2"]'` - Model names are case-sensitive ##### Rate Limiting If you encounter rate limiting: ```yaml env: OPENAI_KEY: ${{ secrets.OPENAI_KEY }} GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} # Add fallback models for better reliability config.fallback_models: '["gpt-4o", "gpt-3.5-turbo"]' # Increase timeout for slower models config.ai_timeout: "300" github_action_config.auto_review: "true" github_action_config.auto_describe: "true" github_action_config.auto_improve: "true" ``` ##### Common Error Messages and Solutions **Error: "Model not found"** - **Solution**: Check the model name format and ensure it matches the exact identifier. See the [Changing a model in PR-Agent](../usage-guide/changing_a_model.md) guide for supported models and their correct identifiers. 
**Error: "API key not found"** - **Solution**: Verify that your API key is correctly set as a repository secret and the environment variable name matches exactly - **Note**: For non-OpenAI models (Gemini, Claude, etc.), you only need the model-specific API key, not `OPENAI_KEY` **Error: "Rate limit exceeded"** - **Solution**: Add fallback models or increase the `config.ai_timeout` value **Error: "Permission denied"** - **Solution**: Ensure your workflow has the correct permissions set: ```yaml permissions: issues: write pull-requests: write contents: write ``` **Error: "Invalid JSON format"** - **Solution**: Check that arrays are properly formatted as JSON strings: ```yaml Correct: config.fallback_models: '["model1", "model2"]' Incorrect (interpreted as a YAML list, not a string): config.fallback_models: ["model1", "model2"] ``` ##### Debugging Tips 1. **Enable verbose logging**: Add `config.verbosity_level: "2"` to see detailed logs 2. **Check GitHub Actions logs**: Look at the step output for specific error messages 3. **Test with minimal configuration**: Start with just the basic setup and add options one by one 4. 
**Verify secrets**: Double-check that all required secrets are set in your repository settings ##### Performance Optimization For better performance with large repositories: ```yaml env: OPENAI_KEY: ${{ secrets.OPENAI_KEY }} GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} # Optimize for large PRs config.large_patch_policy: "clip" config.max_model_tokens: "32000" config.patch_extra_lines_before: "3" config.patch_extra_lines_after: "1" github_action_config.auto_review: "true" github_action_config.auto_describe: "true" github_action_config.auto_improve: "true" ``` #### Reference For more detailed configuration options, see: - [Changing a model in PR-Agent](../usage-guide/changing_a_model.md) - [Configuration options](../usage-guide/configuration_options.md) - [Automations and usage](../usage-guide/automations_and_usage.md#github-action) ### Using a specific release !!! tip "" if you want to pin your action to a specific release (v0.23 for example) for stability reasons, use: ```yaml ... steps: - name: PR Agent action step id: pragent uses: docker://codiumai/pr-agent:0.23-github_action ... ``` For enhanced security, you can also specify the Docker image by its [digest](https://hub.docker.com/repository/docker/codiumai/pr-agent/tags): ```yaml ... steps: - name: PR Agent action step id: pragent uses: docker://codiumai/pr-agent@sha256:14165e525678ace7d9b51cda8652c2d74abb4e1d76b57c4a6ccaeba84663cc64 ... ``` ### Action for GitHub enterprise server !!! tip "" To use the action with a GitHub enterprise server, add an environment variable `GITHUB.BASE_URL` with the API URL of your GitHub server. For example, if your GitHub server is at `https://github.mycompany.com`, add the following to your workflow file: ```yaml env: # ... previous environment values GITHUB.BASE_URL: "https://github.mycompany.com/api/v3" ``` --- ## Run as a GitHub App Allowing you to automate the review process on your private or public repositories. 
1) Create a GitHub App from the [GitHub Developer Portal](https://docs.github.com/en/developers/apps/creating-a-github-app). - Set the following permissions: - Pull requests: Read & write - Issue comment: Read & write - Metadata: Read-only - Contents: Read-only - Set the following events: - Issue comment - Pull request - Push (if you need to enable triggering on PR update) 2) Generate a random secret for your app, and save it for later. For example, you can use: ```bash WEBHOOK_SECRET=$(python -c "import secrets; print(secrets.token_hex(10))") ``` 3) Acquire the following pieces of information from your app's settings page: - App private key (click "Generate a private key" and save the file) - App ID 4) Clone this repository: ```bash git clone https://github.com/qodo-ai/pr-agent.git ``` 5) Copy the secrets template file and fill in the following: ```bash cp pr_agent/settings/.secrets_template.toml pr_agent/settings/.secrets.toml # Edit .secrets.toml file ``` - Your OpenAI key. - Copy your app's private key to the private_key field. - Copy your app's ID to the app_id field. - Copy your app's webhook secret to the webhook_secret field. - Set deployment_type to 'app' in [configuration.toml](https://github.com/qodo-ai/pr-agent/blob/main/pr_agent/settings/configuration.toml) > The .secrets.toml file is not copied to the Docker image by default, and is only used for local development. > If you want to use the .secrets.toml file in your Docker image, you can remove it from the .dockerignore file. > In most production environments, you would inject the secrets file as environment variables or as mounted volumes. > For example, in order to inject a secrets file as a volume in a Kubernetes environment you can update your pod spec to include the following, > assuming you have a secret named `pr-agent-settings` with a key named `.secrets.toml`: ``` volumes: - name: settings-volume secret: secretName: pr-agent-settings // ... containers: // ... 
volumeMounts: - mountPath: /app/pr_agent/settings_prod name: settings-volume ``` > Another option is to set the secrets as environment variables in your deployment environment, for example `OPENAI.KEY` and `GITHUB.USER_TOKEN`. 6) Build a Docker image for the app and optionally push it to a Docker repository. We'll use Dockerhub as an example: ```bash docker build . -t codiumai/pr-agent:github_app --target github_app -f docker/Dockerfile docker push codiumai/pr-agent:github_app # Push to your Docker repository ``` 7. Host the app using a server, serverless function, or container environment. Alternatively, for development and debugging, you may use tools like smee.io to forward webhooks to your local machine. You can check [Deploy as a Lambda Function](#deploy-as-a-lambda-function) 8. Go back to your app's settings, and set the following: - Webhook URL: The URL of your app's server or the URL of the smee.io channel. - Webhook secret: The secret you generated earlier. 9. Install the app by navigating to the "Install App" tab and selecting your desired repositories. > **Note:** When running PR-Agent from GitHub app, the default configuration file (configuration.toml) will be loaded. > However, you can override the default tool parameters by uploading a local configuration file `.pr_agent.toml` > For more information please check out the [USAGE GUIDE](../usage-guide/automations_and_usage.md#github-app) --- ## Additional deployment methods ### Deploy as a Lambda Function Note that since AWS Lambda env vars cannot have "." in the name, you can replace each "." in an env variable with "__".
For example: `GITHUB.WEBHOOK_SECRET` --> `GITHUB__WEBHOOK_SECRET` 1. Follow steps 1-5 from [Run as a GitHub App](#run-as-a-github-app). 2. Build a docker image that can be used as a lambda function ```shell docker buildx build --platform=linux/amd64 . -t codiumai/pr-agent:github_lambda --target github_lambda -f docker/Dockerfile.lambda ``` (Note: --target github_lambda is optional as it's the default target) 3. Push image to ECR ```shell docker tag codiumai/pr-agent:github_lambda .dkr.ecr..amazonaws.com/codiumai/pr-agent:github_lambda docker push .dkr.ecr..amazonaws.com/codiumai/pr-agent:github_lambda ``` 4. Create a lambda function that uses the uploaded image. Set the lambda timeout to be at least 3m. 5. Configure the lambda function to have a Function URL. 6. In the environment variables of the Lambda function, specify `AZURE_DEVOPS_CACHE_DIR` to a writable location such as /tmp. (see [link](https://github.com/qodo-ai/pr-agent/pull/450#issuecomment-1840242269)) 7. Go back to steps 8-9 of [Run as a GitHub App](#run-as-a-github-app) with the function URL as your Webhook URL. The Webhook URL would look like `https:///api/v1/github_webhooks` #### Using AWS Secrets Manager For production Lambda deployments, use AWS Secrets Manager instead of environment variables: 1. Create a secret in AWS Secrets Manager with JSON format like this: ```json { "openai.key": "sk-proj-xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx", "github.webhook_secret": "your-webhook-secret-from-step-2", "github.private_key": "-----BEGIN RSA PRIVATE KEY-----\nMIIEpAIBAAKCAQEA...\n-----END RSA PRIVATE KEY-----" } ``` 2. Add IAM permission `secretsmanager:GetSecretValue` to your Lambda execution role 3. Set these environment variables in your Lambda: ```bash AWS_SECRETS_MANAGER__SECRET_ARN=arn:aws:secretsmanager:us-east-1:123456789012:secret:pr-agent-secrets-AbCdEf CONFIG__SECRET_PROVIDER=aws_secrets_manager ``` --- ### AWS CodeCommit Setup Not all features have been added to CodeCommit yet. 
As of right now, CodeCommit has been implemented to run the PR-Agent CLI on the command line, using AWS credentials stored in environment variables. (More features will be added in the future.) The following is a set of instructions to have PR-Agent do a review of your CodeCommit pull request from the command line: 1. Create an IAM user that you will use to read CodeCommit pull requests and post comments - Note: That user should have CLI access only, not Console access 2. Add IAM permissions to that user, to allow access to CodeCommit (see IAM Role example below) 3. Generate an Access Key for your IAM user 4. Set the Access Key and Secret using environment variables (see Access Key example below) 5. Set the `git_provider` value to `codecommit` in the `pr_agent/settings/configuration.toml` settings file 6. Set the `PYTHONPATH` to include your `pr-agent` project directory - Option A: Add `PYTHONPATH="/PATH/TO/PROJECTS/pr-agent"` to your `.env` file - Option B: Set `PYTHONPATH` and run the CLI in one command, for example: - `PYTHONPATH="/PATH/TO/PROJECTS/pr-agent" python pr_agent/cli.py [--ARGS]` --- ##### AWS CodeCommit IAM Role Example Example IAM permissions for that user to allow access to CodeCommit: - Note: The following is a working example of IAM permissions that has read access to the repositories and write access to allow posting comments - Note: If you only want pr-agent to review your pull requests, you can tighten the IAM permissions further, however this IAM example will work, and allow the pr-agent to post comments to the PR - Note: You may want to replace the `"Resource": "*"` with your list of repos, to limit access to only those repos ```json { "Version": "2012-10-17", "Statement": [ { "Effect": "Allow", "Action": [ "codecommit:BatchDescribe*", "codecommit:BatchGet*", "codecommit:Describe*", "codecommit:EvaluatePullRequestApprovalRules", "codecommit:Get*", "codecommit:List*", "codecommit:PostComment*", "codecommit:PutCommentReaction", 
"codecommit:UpdatePullRequestDescription", "codecommit:UpdatePullRequestTitle" ], "Resource": "*" } ] } ``` ##### AWS CodeCommit Access Key and Secret Example setting the Access Key and Secret using environment variables ```sh export AWS_ACCESS_KEY_ID="XXXXXXXXXXXXXXXX" export AWS_SECRET_ACCESS_KEY="XXXXXXXXXXXXXXXX" export AWS_DEFAULT_REGION="us-east-1" ``` ##### AWS CodeCommit CLI Example After you set up AWS CodeCommit using the instructions above, here is an example CLI run that tells pr-agent to **review** a given pull request. (Replace your specific PYTHONPATH and PR URL in the example) ```sh PYTHONPATH="/PATH/TO/PROJECTS/pr-agent" python pr_agent/cli.py \ --pr_url https://us-east-1.console.aws.amazon.com/codesuite/codecommit/repositories/MY_REPO_NAME/pull-requests/321 \ review ``` ================================================ FILE: docs/docs/installation/gitlab.md ================================================ ## Run as a GitLab Pipeline You can use a pre-built Action Docker image to run PR-Agent as a GitLab pipeline. This is a simple way to get started with PR-Agent without setting up your own server. (1) Add the following file to your repository under `.gitlab-ci.yml`: ```yaml stages: - pr_agent pr_agent_job: stage: pr_agent image: name: codiumai/pr-agent:latest entrypoint: [""] script: - cd /app - echo "Running PR Agent action step" - export MR_URL="$CI_MERGE_REQUEST_PROJECT_URL/merge_requests/$CI_MERGE_REQUEST_IID" - echo "MR_URL=$MR_URL" - export gitlab__url=$CI_SERVER_PROTOCOL://$CI_SERVER_FQDN - export gitlab__PERSONAL_ACCESS_TOKEN=$GITLAB_PERSONAL_ACCESS_TOKEN - export config__git_provider="gitlab" - export openai__key=$OPENAI_KEY - python -m pr_agent.cli --pr_url="$MR_URL" describe - python -m pr_agent.cli --pr_url="$MR_URL" review - python -m pr_agent.cli --pr_url="$MR_URL" improve rules: - if: '$CI_PIPELINE_SOURCE == "merge_request_event"' ``` This script will run PR-Agent on every new merge request. 
You can modify the `rules` section to run PR-Agent on different events. You can also modify the `script` section to run different PR-Agent commands, or with different parameters by exporting different environment variables. (2) Add the following masked variables to your GitLab repository (CI/CD -> Variables): - `GITLAB_PERSONAL_ACCESS_TOKEN`: Your GitLab personal access token. - `OPENAI_KEY`: Your OpenAI key. Note that if your base branches are not protected, don't set the variables as `protected`, since the pipeline will not have access to them. > **Note**: The `$CI_SERVER_FQDN` variable is available starting from GitLab version 16.10. If you're using an earlier version, this variable will not be available. However, you can combine `$CI_SERVER_HOST` and `$CI_SERVER_PORT` to achieve the same result. Please ensure you're using a compatible version or adjust your configuration. > **Note**: The `gitlab__SSL_VERIFY` environment variable can be used to specify the path to a custom CA certificate bundle for SSL verification. GitLab exposes the `$CI_SERVER_TLS_CA_FILE` variable, which points to the custom CA certificate file configured in your GitLab instance. > Alternatively, SSL verification can be disabled entirely by setting `gitlab__SSL_VERIFY=false`, although this is not recommended. ## Run a GitLab webhook server 1. In GitLab create a new user and give it "Reporter" role for the intended group or project. 2. For the user from step 1, generate a `personal_access_token` with `api` access. 3. Generate a random secret for your app, and save it for later (`shared_secret`). For example, you can use: ```bash SHARED_SECRET=$(python -c "import secrets; print(secrets.token_hex(10))") ``` 4. Clone this repository: ```bash git clone https://github.com/qodo-ai/pr-agent.git ``` 5. Prepare variables and secrets. Skip this step if you plan on setting these as environment variables when running the agent: 1. 
In the configuration file/variables: - Set `config.git_provider` to "gitlab" 2. In the secrets file/variables: - Set your AI model key in the respective section - In the [gitlab] section, set `personal_access_token` (with token from step 2) and `shared_secret` (with secret from step 3) - **Authentication type**: Set `auth_type` to `"private_token"` for older GitLab versions (e.g., 11.x) or private deployments. Default is `"oauth_token"` for gitlab.com and newer versions. 6. Build a Docker image for the app and optionally push it to a Docker repository. We'll use Dockerhub as an example: ```bash docker build . -t gitlab_pr_agent --target gitlab_webhook -f docker/Dockerfile docker push codiumai/pr-agent:gitlab_webhook # Push to your Docker repository ``` 7. Set the environmental variables, the method depends on your docker runtime. Skip this step if you included your secrets/configuration directly in the Docker image. ```bash CONFIG__GIT_PROVIDER=gitlab GITLAB__PERSONAL_ACCESS_TOKEN= GITLAB__SHARED_SECRET= GITLAB__URL=https://gitlab.com GITLAB__AUTH_TYPE=oauth_token # Use "private_token" for older GitLab versions OPENAI__KEY= PORT=3000 # Optional: override the webhook server port ``` 8. Create a webhook in your GitLab project. Set the URL to `http[s]:///webhook`, the secret token to the generated secret from step 3, and enable the triggers `push`, `comments` and `merge request events`. 9. Test your installation by opening a merge request or commenting on a merge request using one of PR Agent's commands. ## Deploy as a Lambda Function Note that since AWS Lambda env vars cannot have "." in the name, you can replace each "." in an env variable with "__".
For example: `GITLAB.PERSONAL_ACCESS_TOKEN` --> `GITLAB__PERSONAL_ACCESS_TOKEN` 1. Follow steps 1-5 from [Run a GitLab webhook server](#run-a-gitlab-webhook-server). 2. Build a docker image that can be used as a lambda function ```shell docker buildx build --platform=linux/amd64 . -t codiumai/pr-agent:gitlab_lambda --target gitlab_lambda -f docker/Dockerfile.lambda ``` 3. Push image to ECR ```shell docker tag codiumai/pr-agent:gitlab_lambda .dkr.ecr..amazonaws.com/codiumai/pr-agent:gitlab_lambda docker push .dkr.ecr..amazonaws.com/codiumai/pr-agent:gitlab_lambda ``` 4. Create a lambda function that uses the uploaded image. Set the lambda timeout to be at least 3m. 5. Configure the lambda function to have a Function URL. 6. In the environment variables of the Lambda function, specify `AZURE_DEVOPS_CACHE_DIR` to a writable location such as /tmp. (see [link](https://github.com/qodo-ai/pr-agent/pull/450#issuecomment-1840242269)) 7. Go back to steps 8-9 of [Run a GitLab webhook server](#run-a-gitlab-webhook-server) with the function URL as your Webhook URL. The Webhook URL would look like `https:///webhook` ### Using AWS Secrets Manager For production Lambda deployments, use AWS Secrets Manager instead of environment variables: 1. Create individual secrets for each GitLab webhook with this JSON format (e.g., secret name: `project-webhook-secret-001`) ```json { "gitlab_token": "glpat-xxxxxxxxxxxxxxxxxxxxxxxx", "token_name": "project-webhook-001" } ``` 2. Create a main configuration secret for common settings (e.g., secret name: `pr-agent-main-config`) ```json { "openai.key": "sk-proj-xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx" } ``` 3. Set these environment variables in your Lambda: ```bash CONFIG__SECRET_PROVIDER=aws_secrets_manager AWS_SECRETS_MANAGER__SECRET_ARN=arn:aws:secretsmanager:us-east-1:123456789012:secret:pr-agent-main-config-AbCdEf ``` 4. 
In your GitLab webhook configuration, set the **Secret Token** to the **Secret name** created in step 1: - Example: `project-webhook-secret-001` **Important**: When using Secrets Manager, GitLab's webhook secret must be the Secrets Manager secret name. 5. Add IAM permission `secretsmanager:GetSecretValue` to your Lambda execution role ================================================ FILE: docs/docs/installation/index.md ================================================ # Installation There are several ways to use PR-Agent: - [Locally](./locally.md) - [GitHub integration](./github.md) - [GitLab integration](./gitlab.md) - [BitBucket integration](./bitbucket.md) - [Azure DevOps integration](./azure.md) - [Gitea integration](./gitea.md) ================================================ FILE: docs/docs/installation/locally.md ================================================ To run PR-Agent locally, you first need to acquire two keys: 1. An OpenAI key from [here](https://platform.openai.com/api-keys){:target="_blank"}, with access to GPT-4 and o4-mini (or a key for other [language models](../usage-guide/changing_a_model.md), if you prefer). 2. A personal access token from your Git platform (GitHub, GitLab, BitBucket,Gitea) with repo scope. GitHub token, for example, can be issued from [here](https://github.com/settings/tokens){:target="_blank"} ## Using Docker image A list of the relevant tools can be found in the [tools guide](../tools/). To invoke a tool (for example `review`), you can run PR-Agent directly from the Docker image. Here's how: - For GitHub: ```bash docker run --rm -it -e OPENAI.KEY= -e GITHUB.USER_TOKEN= codiumai/pr-agent:latest --pr_url review ``` If you are using GitHub enterprise server, you need to specify the custom url as variable. 
For example, if your GitHub server is at `https://github.mycompany.com`, add the following to the command: ```bash -e GITHUB.BASE_URL=https://github.mycompany.com/api/v3 ``` - For GitLab: ```bash docker run --rm -it -e OPENAI.KEY= -e CONFIG.GIT_PROVIDER=gitlab -e GITLAB.PERSONAL_ACCESS_TOKEN= codiumai/pr-agent:latest --pr_url review ``` If you have a dedicated GitLab instance, you need to specify the custom url as variable: ```bash -e GITLAB.URL= ``` - For BitBucket: ```bash docker run --rm -it -e CONFIG.GIT_PROVIDER=bitbucket -e OPENAI.KEY=$OPENAI_API_KEY -e BITBUCKET.BEARER_TOKEN=$BITBUCKET_BEARER_TOKEN codiumai/pr-agent:latest --pr_url= review ``` - For Gitea: ```bash docker run --rm -it -e OPENAI.KEY= -e CONFIG.GIT_PROVIDER=gitea -e GITEA.PERSONAL_ACCESS_TOKEN= codiumai/pr-agent:latest --pr_url review ``` If you have a dedicated Gitea instance, you need to specify the custom url as variable: ```bash -e GITEA.URL= ``` For other git providers, update `CONFIG.GIT_PROVIDER` accordingly and check the [`pr_agent/settings/.secrets_template.toml`](https://github.com/qodo-ai/pr-agent/blob/main/pr_agent/settings/.secrets_template.toml) file for environment variables expected names and values. ### Utilizing environment variables It is also possible to provide or override the configuration by setting the corresponding environment variables. You can define the corresponding environment variables by following this convention: `<TABLE>__<KEY>=<VALUE>` or `<TABLE>.<KEY>=<VALUE>`. The `<TABLE>` refers to a table/section in a configuration file and `<KEY>=<VALUE>` refers to the key/value pair of a setting in the configuration file. For example, suppose you want to run `pr_agent` that connects to a self-hosted GitLab instance similar to an example above. You can define the environment variables in a plain text file named `.env` with the following content: ```bash CONFIG__GIT_PROVIDER="gitlab" GITLAB__URL="" GITLAB__PERSONAL_ACCESS_TOKEN="" OPENAI__KEY="" ``` Then, you can run `pr_agent` using Docker with the following command: ```shell docker run --rm -it --env-file .env codiumai/pr-agent:latest ``` --- ### I get an error when running the Docker image. What should I do? If you encounter an error when running the Docker image, it is almost always due to a misconfiguration of api keys or tokens. Note that litellm, which is used by pr-agent, sometimes returns non-informative error messages such as `APIError: OpenAIException - Connection error.` Carefully check the api keys and tokens you provided and make sure they are correct. Adjustments may be needed depending on your llm provider. For example, for Azure OpenAI, additional keys are [needed](../usage-guide/changing_a_model.md#azure). Same goes for other providers, make sure to check the [documentation](../usage-guide/changing_a_model.md#changing-a-model) ## Using pip package Install the package: ```bash pip install pr-agent ``` Then run the relevant tool with the script below.
Make sure to fill in the required parameters (`user_token`, `openai_key`, `pr_url`, `command`): ```python from pr_agent import cli from pr_agent.config_loader import get_settings def main(): # Fill in the following values provider = "github" # github/gitlab/bitbucket/azure_devops user_token = "..." # user token openai_key = "..." # OpenAI key pr_url = "..." # PR URL, for example 'https://github.com/qodo-ai/pr-agent/pull/809' command = "/review" # Command to run (e.g. '/review', '/describe', '/ask="What is the purpose of this PR?"', ...) # Setting the configurations get_settings().set("CONFIG.git_provider", provider) get_settings().set("openai.key", openai_key) get_settings().set("github.user_token", user_token) # Run the command. Feedback will appear in GitHub PR comments cli.run_command(pr_url, command) if __name__ == '__main__': main() ``` ## Run from source 1. Clone this repository: ```bash git clone https://github.com/qodo-ai/pr-agent.git ``` 2. Navigate to the `/pr-agent` folder and install the requirements in your favorite virtual environment: ```bash pip install -e . ``` *Note: If you get an error related to Rust in the dependency installation then make sure Rust is installed and in your `PATH`, instructions: https://rustup.rs* 3. Copy the secrets template file and fill in your OpenAI key and your GitHub user token: ```bash cp pr_agent/settings/.secrets_template.toml pr_agent/settings/.secrets.toml chmod 600 pr_agent/settings/.secrets.toml # Edit .secrets.toml file ``` 4. Run the cli.py script: ```bash python3 -m pr_agent.cli --pr_url review python3 -m pr_agent.cli --pr_url ask python3 -m pr_agent.cli --pr_url describe python3 -m pr_agent.cli --pr_url improve python3 -m pr_agent.cli --pr_url add_docs python3 -m pr_agent.cli --pr_url generate_labels python3 -m pr_agent.cli --issue_url similar_issue ... 
``` [Optional] Add the pr_agent folder to your PYTHONPATH ```bash export PYTHONPATH=$PYTHONPATH: ``` ================================================ FILE: docs/docs/installation/pr_agent.md ================================================ # PR-Agent Installation Guide PR-Agent can be deployed in various environments and platforms. Choose the installation method that best suits your needs: ## 🖥️ Local Installation Learn how to run PR-Agent locally using: - Docker image - pip package - CLI from source code [View Local Installation Guide →](./locally.md) ## 🐙 GitHub Integration Set up PR-Agent with GitHub as: - GitHub Action - Local GitHub App [View GitHub Integration Guide →](./github.md) ## 🦊 GitLab Integration Deploy PR-Agent on GitLab as: - GitLab pipeline job - Local GitLab webhook server [View GitLab Integration Guide →](./gitlab.md) ## 🟦 BitBucket Integration Implement PR-Agent in BitBucket as: - BitBucket pipeline job - Local BitBucket server [View BitBucket Integration Guide →](./bitbucket.md) ## 🔷 Azure DevOps Integration Configure PR-Agent with Azure DevOps as: - Azure DevOps pipeline job - Local Azure DevOps webhook [View Azure DevOps Integration Guide →](./azure.md) ================================================ FILE: docs/docs/overview/data_privacy.md ================================================ ## Self-hosted PR-Agent - If you self-host PR-Agent with your OpenAI (or other LLM provider) API key, it is between you and the provider. 
================================================ FILE: docs/docs/summary.md ================================================ # Table of contents * [Overview](index.md) * [Data Privacy](overview/data_privacy.md) ## Installation * [Installation](installation/index.md) * [PR-Agent](installation/pr_agent.md) ## Usage Guide * [Usage Guide](usage-guide/index.md) * [Introduction](usage-guide/introduction.md) * [Configuration File](usage-guide/configuration_options.md) * [Usage and Automation](usage-guide/automations_and_usage.md) * [Managing Mail Notifications](usage-guide/mail_notifications.md) * [Changing a Model](usage-guide/changing_a_model.md) * [Additional Configurations](usage-guide/additional_configurations.md) * [Frequently Asked Questions](faq/index.md) ## Tools * [Tools](tools/index.md) * [Describe](tools/describe.md) * [Review](tools/review.md) * [Improve](tools/improve.md) * [Ask](tools/ask.md) * [Add Docs](tools/add_docs.md) * [Generate Labels](tools/generate_labels.md) * [Similar Issues](tools/similar_issues.md) * [Help](tools/help.md) * [Help Docs](tools/help_docs.md) * [Update Changelog](tools/update_changelog.md) ## Core Abilities * [Core Abilities](core-abilities/index.md) * [Chat on code suggestions](core-abilities/interactivity.md) * [Compression strategy](core-abilities/compression_strategy.md) * [Dynamic context](core-abilities/dynamic_context.md) * [Fetching ticket context](core-abilities/fetching_ticket_context.md) * [Interactivity](core-abilities/interactivity.md) * [Local and global metadata](core-abilities/metadata.md) * [Self-reflection](core-abilities/self_reflection.md) ================================================ FILE: docs/docs/tools/add_docs.md ================================================ ## Overview The `add_docs` tool scans the PR code changes and suggests documentation for any code components that are missing documentation, such as functions, classes, and methods. 
It can be invoked manually by commenting on any PR: ``` /add_docs ``` ## Example usage Invoke the tool manually by commenting `/add_docs` on any PR: ![Add Docs](https://codium.ai/images/pr_agent/add_docs_comment.png){width=512} The tool will generate documentation suggestions as inline code suggestions: ![Add Docs Result](https://codium.ai/images/pr_agent/add_docs_result.png){width=512} ### Language-specific documentation styles The tool automatically detects the programming language and generates documentation in the appropriate format: | Language | Documentation Format | |----------|---------------------| | Python | Docstrings (Sphinx, Google, Numpy styles) | | Java | Javadocs | | JavaScript/TypeScript | JSdocs | | C++ | Doxygen | | Other | Generic documentation | ## Configuration options Under the section `[pr_add_docs]`, the following options are available: | Option | Type | Default | Description | |--------|------|---------|-------------| | `extra_instructions` | string | `""` | Additional instructions for the AI model | | `docs_style` | string | `"Sphinx"` | Documentation style for Python. Options: `"Sphinx"`, `"Google Style with Args, Returns, Attributes...etc"`, `"Numpy Style"`, `"PEP257"`, `"reStructuredText"` | | `file` | string | `""` | Specific file to document (useful when multiple components have the same name) | | `class_name` | string | `""` | Specific class name to target (useful when methods have the same name in the same file) | ### Example configuration To customize the documentation style, add the following to your configuration file: ```toml [pr_add_docs] docs_style = "Google Style with Args, Returns, Attributes...etc" extra_instructions = "Focus on documenting public methods and include usage examples" ``` ### Command line options You can pass configuration options directly in the command: ``` /add_docs --pr_add_docs.docs_style="Numpy Style" ``` ## How it works 1. 
The tool analyzes the PR diff to identify code components (functions, classes, methods) that lack documentation 2. It uses AI to generate appropriate documentation based on the code context and language 3. Documentation suggestions are published as inline code suggestions that can be applied with a single click ================================================ FILE: docs/docs/tools/ask.md ================================================ ## Overview The `ask` tool answers questions about the PR, based on the PR code changes. Make sure to be specific and clear in your questions. It can be invoked manually by commenting on any PR: ``` /ask "..." ``` ## Example usage ![Ask Comment](https://codium.ai/images/pr_agent/ask_comment.png){width=512} ![Ask](https://codium.ai/images/pr_agent/ask.png){width=512} ## Ask lines You can run `/ask` on specific lines of code in the PR from the PR's diff view. The tool will answer questions based on the code changes in the selected lines. - Click on the '+' sign next to the line number to select the line. - To select multiple lines, click on the '+' sign of the first line and then hold and drag to select the rest of the lines. - write `/ask "..."` in the comment box and press `Add single comment` button. ![Ask Line](https://codium.ai/images/pr_agent/Ask_line.png){width=512} Note that the tool does not have "memory" of previous questions, and answers each question independently. ## Ask on images You can also ask questions about images that appear in the comment, where the entire PR code will be used as context.
The basic syntax is: ``` /ask "..." [Image](https://real_link_to_image) ``` where `https://real_link_to_image` is the direct link to the image. Note that GitHub has a built-in mechanism of pasting images in comments. However, pasted image does not provide a direct link. To get a direct link to an image, we recommend using the following scheme: 1\. First, post a comment that contains **only** the image: ![Ask image1](https://codium.ai/images/pr_agent/ask_images1.png){width=512} 2\. Quote reply to that comment: ![Ask image2](https://codium.ai/images/pr_agent/ask_images2.png){width=512} 3\. In the screen opened, type the question below the image: ![Ask image3](https://codium.ai/images/pr_agent/ask_images3.png){width=512} ![Ask image4](https://codium.ai/images/pr_agent/ask_images4.png){width=512} 4\. Post the comment, and receive the answer: ![Ask image5](https://codium.ai/images/pr_agent/ask_images5.png){width=512} See a full video tutorial [here](https://codium.ai/images/pr_agent/ask_image_video.mov) ================================================ FILE: docs/docs/tools/describe.md ================================================ ## Overview The `describe` tool scans the PR code changes, and generates a description for the PR - title, type, summary, walkthrough and labels. 
The tool can be triggered automatically every time a new PR is [opened](../usage-guide/automations_and_usage.md#github-app-automatic-tools-when-a-new-pr-is-opened), or it can be invoked manually by commenting on any PR: ``` /describe ``` ## Example usage ### Manual triggering Invoke the tool manually by commenting `/describe` on any PR: ![Describe comment](https://codium.ai/images/pr_agent/describe_comment.png){width=512} After ~30 seconds, the tool will generate a description for the PR: ![Describe New](https://codium.ai/images/pr_agent/describe_new.png){width=512} If you want to edit [configurations](#configuration-options), add the relevant ones to the command: ``` /describe --pr_description.some_config1=... --pr_description.some_config2=... ``` ### Automatic triggering To run the `describe` tool automatically when a PR is opened, define in a [configuration file](../usage-guide/configuration_options.md#wiki-configuration-file): ``` [github_app] pr_commands = [ "/describe", ... ] [pr_description] publish_labels = true ... ``` - The `pr_commands` lists commands that will be executed automatically when a PR is opened. - The `[pr_description]` section contains the configurations for the `describe` tool you want to edit (if any). ## Preserving the original user description By default, PR-Agent tries to preserve your original PR description by placing it above the generated content. This requires including your description during the initial PR creation. "PR-Agent removed the original description from the PR. Why?" From our experience, there are two possible reasons: - If you edit the description _while_ the automated tool is running, a race condition may occur, potentially causing your original description to be lost. Hence, create a description before launching the PR. - When _updating_ PR descriptions, the `/describe` tool considers everything above the "PR Type" field as user content and will preserve it.
Everything below this marker is treated as previously auto-generated content and will be replaced. ![Describe comment](https://codium.ai/images/pr_agent/pr_description_user_description.png){width=512} ## Sequence Diagram Support The `/describe` tool includes a Mermaid sequence diagram showing component/function interactions. This option is enabled by default via the `pr_description.enable_pr_diagram` param. [//]: # (### How to enable\disable) [//]: # () [//]: # (In your configuration:) [//]: # () [//]: # (```) [//]: # (toml) [//]: # ([pr_description]) [//]: # (enable_pr_diagram = true) [//]: # (```) ## Configuration options ???+ example "Possible configurations"
publish_labels If set to true, the tool will publish labels to the PR. Default is false.
publish_description_as_comment If set to true, the tool will publish the description as a comment to the PR. If false, it will overwrite the original description. Default is false.
publish_description_as_comment_persistent If set to true and `publish_description_as_comment` is true, the tool will publish the description as a persistent comment to the PR. Default is true.
add_original_user_description If set to true, the tool will add the original user description to the generated description. Default is true.
generate_ai_title If set to true, the tool will also generate an AI title for the PR. Default is false.
extra_instructions Optional extra instructions for the tool. For example: "focus on the changes in the file X. Ignore changes in ..."
enable_pr_type If set to false, it will not show the `PR type` as a text value in the description content. Default is true.
final_update_message If set to true, the tool will add a comment message [`PR Description updated to latest commit...`](https://github.com/qodo-ai/pr-agent/pull/499#issuecomment-1837412176) after `/describe` finishes running. Default is true.
enable_semantic_files_types If set to true, "Changes walkthrough" section will be generated. Default is true.
file_table_collapsible_open_by_default If set to true, the file list in the "Changes walkthrough" section will be open by default. If set to false, it will be closed by default. Default is false.
collapsible_file_list If set to true, the file list in the "Changes walkthrough" section will be collapsible. If set to "adaptive", the file list will be collapsible only if there are more than 8 files. Default is "adaptive".
enable_large_pr_handling If set to true, in case of a large PR the tool will make several calls to the AI and combine them to be able to cover more files. Default is true.
enable_help_text If set to true, the tool will display a help text in the comment. Default is false.
enable_pr_diagram If set to true, the tool will generate a horizontal Mermaid flowchart summarizing the main pull request changes. This field remains empty if not applicable. Default is true.
auto_create_ticket If set to true, this will automatically create a ticket in the ticketing system when a PR is opened. Default is false.
## Markers template To enable markers, set `pr_description.use_description_markers=true`. Markers make it easy to integrate user content and auto-generated content through a template-like mechanism. For example, if the original PR description was: ``` User content... ## PR Type: pr_agent:type ## PR Description: pr_agent:summary ## PR Walkthrough: pr_agent:walkthrough ## PR Diagram: pr_agent:diagram ``` The marker `pr_agent:type` will be replaced with the PR type, `pr_agent:summary` will be replaced with the PR summary, `pr_agent:walkthrough` will be replaced with the PR walkthrough, and `pr_agent:diagram` will be replaced with the sequence diagram (if enabled). ![Describe markers before](https://codium.ai/images/pr_agent/describe_markers_before.png){width=512} becomes ![Describe markers after](https://codium.ai/images/pr_agent/describe_markers_after.png){width=512} **Configuration params**: - `use_description_markers`: if set to true, the tool will use the markers template. It replaces every marker of the form `pr_agent:marker_name` with the relevant content. Default is false. - `include_generated_by_header`: if set to true, the tool will add a dedicated header: 'Generated by PR Agent at ...' to any automatic content. Default is true. - `diagram`: if present as a marker, will be replaced by the PR sequence diagram (if enabled). ## Custom labels The default labels of the describe tool are quite generic, since they are meant to be used in any repo: [`Bug fix`, `Tests`, `Enhancement`, `Documentation`, `Other`]. You can define custom labels that are relevant for your repo and use cases. Custom labels can be defined in a configuration file, or directly in the repo's [labels page](#handle-custom-labels-from-the-repos-labels-page). Make sure to provide a proper title, and a detailed and well-phrased description for each label, so the tool will know when to suggest it.
Each label description should be a **conditional statement** that indicates whether or not to add the label to the PR, according to the PR content. ???+ tip "Auto-remove custom label when no longer relevant" If the custom label is no longer relevant, it will be automatically removed from the PR by running the `generate_labels` tool or the `describe` tool. ### Handle custom labels from a configuration file Example for a custom labels configuration setup in a configuration file: ``` [config] enable_custom_labels=true [custom_labels."sql_changes"] description = "Use when a PR contains changes to SQL queries" [custom_labels."test"] description = "use when a PR primarily contains new tests" ... ``` ### Handle custom labels from the Repo's labels page You can also control the custom labels that will be suggested by the `describe` tool from the repo's labels page: - GitHub : go to `https://github.com/{owner}/{repo}/labels` (or click on the "Labels" tab in the issues or PRs page) - GitLab : go to `https://gitlab.com/{owner}/{repo}/-/labels` (or click on "Manage" -> "Labels" on the left menu) Now add/edit the custom labels. They should be formatted as follows: - Label name: The name of the custom label. - Description: Start the description with the prefix `pr_agent:`, for example: `pr_agent: Description of when AI should suggest this label`.
Examples for custom labels: - `Main topic:performance` - pr_agent:The main topic of this PR is performance - `New endpoint` - pr_agent:A new endpoint was added in this PR - `SQL query` - pr_agent:A new SQL query was added in this PR - `Dockerfile changes` - pr_agent:The PR contains changes in the Dockerfile - ... The description should be comprehensive and detailed, indicating when to add the desired label. For example: ![Add native custom labels](https://codium.ai/images/pr_agent/add_native_custom_labels.png){width=768} ## Usage Tips !!! tip "Automation" - When you first install PR-Agent app, the [default mode](../usage-guide/automations_and_usage.md#github-app) for the describe tool is: ``` pr_commands = ["/describe", ...] ``` meaning the `describe` tool will run automatically on every PR, with the default configurations. - Markers are an alternative way to control the generated description, to give maximal control to the user. If you set: ``` pr_commands = ["/describe --pr_description.use_description_markers=true", ...] ``` the tool will replace every marker of the form `pr_agent:marker_name` in the PR description with the relevant content, where `marker_name` is one of the following: *`type`: the PR type. * `summary`: the PR summary. * `walkthrough`: the PR walkthrough. - Note that when markers are enabled, if the original PR description does not contain any markers, the tool will not alter the description at all. ================================================ FILE: docs/docs/tools/generate_labels.md ================================================ ## Overview The `generate_labels` tool scans the PR code changes and generates custom labels for the PR based on the content and context of the changes. 
It can be invoked manually by commenting on any PR: ``` /generate_labels ``` ## Example usage Invoke the tool manually by commenting `/generate_labels` on any PR: ![Generate Labels](https://codium.ai/images/pr_agent/generate_labels_comment.png){width=512} The tool will analyze the PR and add appropriate labels: ![Generate Labels Result](https://codium.ai/images/pr_agent/generate_labels_result.png){width=512} ## Configuration options The `generate_labels` tool uses configurations from the `[pr_description]` section for custom labels. ### Enabling custom labels To use custom labels, you need to enable them in the configuration: ```toml [config] enable_custom_labels = true ``` ### Defining custom labels You can define your own custom labels in the `[custom_labels]` section. See the [custom_labels.toml](https://github.com/qodo-ai/pr-agent/blob/main/pr_agent/settings/custom_labels.toml) file for examples. Example configuration: ```toml [custom_labels."Bug fix"] description = "A fix for a bug in the codebase" [custom_labels."Feature"] description = "A new feature or enhancement" [custom_labels."Documentation"] description = "Documentation changes only" [custom_labels."Tests"] description = "Adding or modifying tests" [custom_labels."Refactoring"] description = "Code refactoring without functional changes" ``` ### How labels are applied 1. The tool analyzes the PR diff and commit messages 2. It uses AI to determine which labels best match the PR content 3. Labels are automatically applied to the PR (if the git provider supports it) 4. If labels cannot be applied directly, they are published as a comment ## Comparison with `/describe` labels The `/describe` tool also generates labels as part of its output. 
The key differences are: | Feature | `/generate_labels` | `/describe` | |---------|-------------------|-------------| | Purpose | Dedicated label generation | Full PR description with labels | | Output | Labels only | Title, summary, walkthrough, and labels | | Custom labels | ✅ Supported | ✅ Supported | | Use case | When you only need labels | When you want a complete PR description | ## Tips - Use custom labels that match your team's workflow and labeling conventions - Combine with automation to automatically label PRs when they are opened - Review the generated labels and adjust custom label descriptions if the AI consistently misclassifies PRs ================================================ FILE: docs/docs/tools/help.md ================================================ ## Overview The `help` tool provides a list of all the available tools and their descriptions. For PR-Agent users, it also enables to trigger each tool by checking the relevant box. It can be invoked manually by commenting on any PR: ``` /help ``` ## Example usage Invoke the `help` tool by commenting on a PR with: ![Help tool input](https://codium.ai/images/pr_agent/help1.png){width=750} Response will include a list of available tools: ![Help tool output](https://codium.ai/images/pr_agent/help2.png){width=750} ================================================ FILE: docs/docs/tools/help_docs.md ================================================ ## Overview The `help_docs` tool can answer a free-text question based on a git documentation folder. It can be invoked manually by commenting on any PR or Issue: ``` /help_docs "..." ``` Or configured to be triggered automatically when a [new issue is opened](#run-as-a-github-action). The tool assumes by default that the documentation is located in the root of the repository, at `/docs` folder. 
However, this can be customized by setting the `docs_path` configuration option: ```toml [pr_help_docs] repo_url = "" # The repository to use as context docs_path = "docs" # The documentation folder repo_default_branch = "main" # The branch to use in case repo_url overwritten ``` See more configuration options in the [Configuration options](#configuration-options) section. ## Example usage [//]: # (#### Asking a question about this repository:) [//]: # (![help_docs on the documentation of this repository](https://codium.ai/images/pr_agent/help_docs_comment.png){width=512}) **Asking a question about another repository** ![help_docs on the documentation of another repository](https://codium.ai/images/pr_agent/help_docs_comment_explicit_git.png){width=512} **Response**: ![help_docs response](https://codium.ai/images/pr_agent/help_docs_response.png){width=512} ## Run automatically when a new issue is opened You can configure PR-Agent to run `help_docs` automatically on any newly created issue. This can be useful, for example, for providing immediate feedback to users who open issues with questions on open-source projects with extensive documentation. Here's how: 1) Follow the steps depicted under [Run as a Github Action](../installation/github.md#run-as-a-github-action) to create a new workflow, such as:`.github/workflows/help_docs.yml`: 2) Edit your yaml file to the following: ```yaml name: Run pr agent on every opened issue, respond to user comments on an issue #When the action is triggered on: issues: types: [opened] #New issue # Read env. 
variables env: GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} GITHUB_API_URL: ${{ github.api_url }} GIT_REPO_URL: ${{ github.event.repository.clone_url }} ISSUE_URL: ${{ github.event.issue.html_url || github.event.comment.html_url }} ISSUE_BODY: ${{ github.event.issue.body || github.event.comment.body }} OPENAI_KEY: ${{ secrets.OPENAI_KEY }} # The actual set of actions jobs: issue_agent: runs-on: ubuntu-latest if: ${{ github.event.sender.type != 'Bot' }} #Do not respond to bots # Set required permissions permissions: contents: read # For reading repository contents issues: write # For commenting on issues steps: - name: Run PR Agent on Issues if: ${{ env.ISSUE_URL != '' }} uses: docker://codiumai/pr-agent:latest with: entrypoint: /bin/bash #Replace invoking cli.py directly with a shell args: | -c "cd /app && \ echo 'Running Issue Agent action step on ISSUE_URL=$ISSUE_URL' && \ export config__git_provider='github' && \ export github__user_token=$GITHUB_TOKEN && \ export github__base_url=$GITHUB_API_URL && \ export openai__key=$OPENAI_KEY && \ python -m pr_agent.cli --issue_url=$ISSUE_URL --pr_help_docs.repo_url="..." --pr_help_docs.docs_path="..." --pr_help_docs.openai_key=$OPENAI_KEY && \ help_docs "$ISSUE_BODY" ``` 3) Following completion of the remaining steps (such as adding secrets and relevant configurations, such as `repo_url` and `docs_path`) merge this change to your main branch. When a new issue is opened, you should see a comment from `github-actions` bot with an auto response, assuming the question is related to the documentation of the repository. --- ## Configuration options Under the section `pr_help_docs`, the [configuration file](https://github.com/qodo-ai/pr-agent/blob/main/pr_agent/settings/configuration.toml#L199) contains options to customize the 'help docs' tool: - `repo_url`: If not overwritten, will use the repo from where the context came from (issue or PR), otherwise - use the given repo as context. 
- `repo_default_branch`: The branch to use in case repo_url overwritten, otherwise - has no effect. - `docs_path`: Relative path from root of repository (either the one this PR has been issued for, or above repo url). - `exclude_root_readme`: Whether or not to exclude the root README file for querying the model. - `supported_doc_exts` : Which file extensions should be included for the purpose of querying the model. --- ================================================ FILE: docs/docs/tools/improve.md ================================================ ## Overview The `improve` tool scans the PR code changes, and automatically generates meaningful suggestions for improving the PR code. The tool can be triggered automatically every time a new PR is [opened](../usage-guide/automations_and_usage.md#github-app-automatic-tools-when-a-new-pr-is-opened), or it can be invoked manually by commenting on any PR: ```toml /improve ``` ## How it looks === "Suggestions Overview" ![code_suggestions_as_comment_closed](https://codium.ai/images/pr_agent/code_suggestions_as_comment_closed.png){width=512} === "Selecting a specific suggestion" ![code_suggestions_as_comment_open](https://codium.ai/images/pr_agent/code_suggestions_as_comment_open.png){width=512} ___ ## Example usage ### Manual triggering Invoke the tool manually by commenting `/improve` on any PR. The code suggestions by default are presented as a single comment: To edit [configurations](#configuration-options) related to the `improve` tool, use the following template: ```toml /improve --pr_code_suggestions.some_config1=... --pr_code_suggestions.some_config2=... 
``` For example, you can choose to present all the suggestions as committable code comments, by running the following command: ```toml /improve --pr_code_suggestions.commitable_code_suggestions=true ``` ![improve](https://codium.ai/images/pr_agent/improve.png){width=512} ### Automatic triggering To run the `improve` tool automatically when a PR is opened, define in a [configuration file](../usage-guide/configuration_options.md#wiki-configuration-file): ```toml [github_app] pr_commands = [ "/improve", ... ] [pr_code_suggestions] num_code_suggestions_per_chunk = ... ... ``` - The `pr_commands` lists commands that will be executed automatically when a PR is opened. - The `[pr_code_suggestions]` section contains the configurations for the `improve` tool you want to edit (if any) ### Table vs Committable code comments PR-Agent supports two modes for presenting code suggestions: 1) [Table](https://codium.ai/images/pr_agent/code_suggestions_as_comment_closed.png) mode 2) [Inline Committable](https://codium.ai/images/pr_agent/improve.png) code comments mode. The table format offers several key advantages: - **Reduced noise**: Creates a cleaner PR experience with less clutter - **Quick overview and prioritization**: Enables quick review of one-liner summaries, impact levels, and easy prioritization - **High-level suggestions**: High-level suggestions that aren't tied to specific code chunks are presented only in the table mode - **Interactive features**: Provides 'more' and 'update' functionality via clickable buttons - **Centralized tracking**: Shows suggestion implementation status in one place - **IDE integration**: Allows applying suggestions directly in your IDE via the CLI tool Table mode is the default of PR-Agent, and is the recommended approach for most users due to these benefits.
![code_suggestions_as_comment_closed.png](https://codium.ai/images/pr_agent/code_suggestions_as_comment_closed.png){width=512} Teams with specific preferences can enable committable code comments mode in their local configuration, or use [dual publishing mode](#dual-publishing-mode). > `Note - due to platform limitations, Bitbucket cloud and server supports only committable code comments mode.` ## `Extra instructions` and `best practices` The `improve` tool can be further customized by providing additional instructions and best practices to the AI model. ### Extra instructions You can use the `extra_instructions` configuration option to give the AI model additional instructions for the `improve` tool. Be specific, clear, and concise in the instructions. With extra instructions, you are the prompter. Examples for possible instructions: ```toml [pr_code_suggestions] extra_instructions="""\ (1) Answer in Japanese (2) Don't suggest to add try-except block (3) Ignore changes in toml files ... """ ``` Use triple quotes to write multi-line instructions. Use bullet points or numbers to make the instructions more readable. ### Best practices `Platforms supported: GitHub, GitLab, Bitbucket` PR-Agent supports both simple and hierarchical best practices configurations to provide guidance to the AI model for generating relevant code suggestions. ???- tip "Writing effective best practices files" The following guidelines apply to all best practices files: - Write clearly and concisely - Include brief code examples when helpful with before/after patterns - Focus on project-specific guidelines that will result in relevant suggestions you actually want to get - Keep each file relatively short, under 800 lines, since: - AI models may not process effectively very long documents - Long files tend to contain generic guidelines already known to AI - Maximum multiple file accumulated content is limited to 2000 lines. 
- Use pattern-based structure rather than simple bullet points for better clarity ???- tip "Example of a best practices file" Pattern 1: Add proper error handling with try-except blocks around external function calls. Example code before: ```python # Some code that might raise an exception return process_pr_data(data) ``` Example code after: ```python try: # Some code that might raise an exception return process_pr_data(data) except Exception as e: logger.exception("Failed to process request", extra={"error": e}) ``` Pattern 2: Add defensive null/empty checks before accessing object properties or performing operations on potentially null variables to prevent runtime errors. Example code before: ```python def get_pr_code(pr_data): if "changed_code" in pr_data: return pr_data.get("changed_code", "") return "" ``` Example code after: ```python def get_pr_code(pr_data): if pr_data is None: return "" if "changed_code" in pr_data: return pr_data.get("changed_code", "") return "" ``` #### Local best practices For basic usage, create a `best_practices.md` file in your repository's root directory containing a list of best practices, coding standards, and guidelines specific to your repository. The AI model will use this `best_practices.md` file as a reference, and in case the PR code violates any of the guidelines, it will create additional suggestions, with a dedicated label: `Organization best practice`. ### Combining 'extra instructions' and 'best practices' The `extra instructions` configuration is more related to the `improve` tool prompt. It can be used, for example, to avoid specific suggestions ("Don't suggest to add try-except block", "Ignore changes in toml files", ...) or to emphasize specific aspects or formats ("Answer in Japanese", "Give only short suggestions", ...) In contrast, the `best_practices.md` file is a general guideline for the way code should be written in the repo. 
Using a combination of both can help the AI model to provide relevant and tailored suggestions. ## Usage Tips ### Implementing the proposed code suggestions Each generated suggestion consists of three key elements: 1. A single-line summary of the proposed change 2. An expandable section containing a comprehensive description of the suggestion 3. A diff snippet showing the recommended code modification (before and after) We advise users to apply critical analysis and judgment when implementing the proposed suggestions. In addition to mistakes (which may happen, but are rare), sometimes the presented code modification may serve more as an _illustrative example_ than a directly applicable solution. In such cases, we recommend prioritizing the suggestion's detailed description, using the diff snippet primarily as a supporting reference. ### Dual publishing mode Our recommended approach for presenting code suggestions is through a [table](./improve.md#overview) (`--pr_code_suggestions.commitable_code_suggestions=false`). This method significantly reduces the PR footprint and allows for quick and easy digestion of multiple suggestions. We also offer a complementary **dual publishing mode**. When enabled, suggestions exceeding a certain score threshold are not only displayed in the table, but also presented as committable PR comments. This mode helps highlight suggestions deemed more critical. To activate dual publishing mode, use the following setting: ```toml [pr_code_suggestions] dual_publishing_score_threshold = x ``` Where x represents the minimum score threshold (>=) for suggestions to be presented as committable PR comments in addition to the table. Default is -1 (disabled). 
### Self-review `Platforms supported: GitHub, GitLab` If you set in a configuration file: ```toml [pr_code_suggestions] demand_code_suggestions_self_review = true ``` The `improve` tool will add a checkbox below the suggestions, prompting user to acknowledge that they have reviewed the suggestions. You can set the content of the checkbox text via: ```toml [pr_code_suggestions] code_suggestions_self_review_text = "... (your text here) ..." ``` ![self_review_1](https://codium.ai/images/pr_agent/self_review_1.png){width=512} !!! tip "Tip - Reducing visual footprint after self-review" The configuration parameter `pr_code_suggestions.fold_suggestions_on_self_review` (default is True) can be used to automatically fold the suggestions after the user clicks the self-review checkbox. This reduces the visual footprint of the suggestions, and also indicates to the PR reviewer that the suggestions have been reviewed by the PR author, and don't require further attention. !!! tip "Tip - Demanding self-review from the PR author" By setting: ```toml [pr_code_suggestions] approve_pr_on_self_review = true ``` the tool can automatically add an approval when the PR author clicks the self-review checkbox. - If you set the number of required reviewers for a PR to 2, this effectively means that the PR author must click the self-review checkbox before the PR can be merged (in addition to a human reviewer). ![self_review_2](https://codium.ai/images/pr_agent/self_review_2.png){width=512} - If you keep the number of required reviewers for a PR to 1 and enable this configuration, this effectively means that the PR author can approve the PR by actively clicking the self-review checkbox. To prevent unauthorized approvals, this configuration defaults to false, and cannot be altered through online comments; enabling requires a direct update to the configuration file and a commit to the repository. 
This ensures that utilizing the feature demands a deliberate documented decision by the repository owner. ### How many code suggestions are generated? PR-Agent uses a dynamic strategy to generate code suggestions based on the size of the pull request (PR). Here's how it works: #### 1. Chunking large PRs - PR-Agent divides large PRs into 'chunks'. - Each chunk contains up to `config.max_model_tokens` tokens (default: 32,000). #### 2. Generating suggestions - For each chunk, PR-Agent generates up to `pr_code_suggestions.num_code_suggestions_per_chunk` suggestions (default: 3). This approach has two main benefits: - Scalability: The number of suggestions scales with the PR size, rather than being fixed. - Quality: By processing smaller chunks, the AI can maintain higher quality suggestions, as larger contexts tend to decrease AI performance. Note: Chunking is primarily relevant for large PRs. For most PRs (up to 600 lines of code), PR-Agent will be able to process the entire code in a single call. ## Configuration options ???+ example "General options"
extra_instructions Optional extra instructions to the tool. For example: "focus on the changes in the file X. Ignore change in ...".
commitable_code_suggestions If set to true, the tool will display the suggestions as committable code comments. Default is false.
dual_publishing_score_threshold Minimum score threshold for suggestions to be presented as committable PR comments in addition to the table. Default is -1 (disabled).
focus_only_on_problems If set to true, suggestions will focus primarily on identifying and fixing code problems, and less on style considerations like best practices, maintainability, or readability. Default is true.
persistent_comment If set to true, the improve comment will be persistent, meaning that every new improve request will edit the previous one. Default is true.
suggestions_score_threshold Any suggestion with an importance score below this threshold will be removed. Default is 0. We highly recommend not setting this value above 7-8, since a higher threshold may filter out relevant suggestions that can be useful.
enable_help_text If set to true, the tool will display a help text in the comment. Default is false.
enable_chat_text If set to true, the tool will display a reference to the PR chat in the comment. Default is false.
publish_output_no_suggestions If set to true, the tool will publish a comment even if no suggestions were found. Default is true.
???+ example "Params for number of suggestions and AI calls"
num_code_suggestions_per_chunk Number of code suggestions provided by the 'improve' tool, per chunk. Default is 3.
max_number_of_calls Maximum number of chunks. Default is 3.
## Understanding AI Code Suggestions - **AI Limitations:** AI models for code are getting better and better, but they are not flawless. Not all the suggestions will be perfect, and a user should not accept all of them automatically. Critical reading and judgment are required. Mistakes of the AI are rare but can happen, and it is usually quite easy for a human to spot them. - **Purpose of Suggestions:** - **Self-reflection:** The suggestions aim to enable developers to _self-reflect_ and improve their pull requests. This process can help to identify blind spots, uncover missed edge cases, and enhance code readability and coherency. Even when a specific code suggestion isn't suitable, the underlying issue it highlights often reveals something important that might deserve attention. - **Bug detection:** The suggestions also alert on any _critical bugs_ that may have been identified during the analysis. This provides an additional safety net to catch potential issues before they make it into production. It's perfectly acceptable to implement only the suggestions you find valuable for your specific context. - **Hierarchy:** Presenting the suggestions in a structured hierarchical table enables the user to _quickly_ understand them, and to decide which ones are relevant and which are not. - **Customization:** To guide the model to suggestions that are more relevant to the specific needs of your project, we recommend using the [`extra_instructions`](./improve.md#extra-instructions-and-best-practices) and [`best practices`](./improve.md#best-practices) fields. - **Model Selection:** For specific programming languages or use cases, some models may perform better than others. 
================================================ FILE: docs/docs/tools/index.md ================================================ # Tools Here is a list of PR-Agent tools, each with a dedicated page that explains how to use it: | Tool | Description | |------------------------------------------------------------------------------------------|---------------------------------------------------------------------------------------------------------------------------------------------| | **[PR Description (`/describe`)](./describe.md)** | Automatically generating PR description - title, type, summary, code walkthrough and labels | | **[PR Review (`/review`)](./review.md)** | Adjustable feedback about the PR, possible issues, security concerns, review effort and more | | **[Code Suggestions (`/improve`)](./improve.md)** | Code suggestions for improving the PR | | **[Question Answering (`/ask ...`)](./ask.md)** | Answering free-text questions about the PR, or on specific code lines | | **[Add Documentation (`/add_docs`)](./add_docs.md)** | Generate documentation for code components that are missing it | | **[Generate Labels (`/generate_labels`)](./generate_labels.md)** | Generate custom labels for the PR based on the code changes | | **[Similar Issues (`/similar_issue`)](./similar_issues.md)** | Find similar issues in the repository based on the current issue | | **[Help (`/help`)](./help.md)** | Provides a list of all the available tools | | **[Help Docs (`/help_docs`)](./help_docs.md)** | Answer a free-text question based on a git documentation folder | | **[Update Changelog (`/update_changelog`)](./update_changelog.md)** | Automatically updating the CHANGELOG.md file with the PR changes | ================================================ FILE: docs/docs/tools/review.md ================================================ ## Overview The `review` tool scans the PR code changes, and generates feedback about the PR, aiming to aid the reviewing process.
The tool can be triggered automatically every time a new PR is [opened](../usage-guide/automations_and_usage.md#github-app-automatic-tools-when-a-new-pr-is-opened), or can be invoked manually by commenting on any PR: ``` /review ``` Note that the main purpose of the `review` tool is to provide the **PR reviewer** with useful feedback and insights. The PR author, in contrast, may prefer to save time and focus on the output of the [improve](./improve.md) tool, which provides actionable code suggestions. (Read more about the different personas in the PR process and how PR-Agent aims to assist them in our [blog](https://www.codium.ai/blog/understanding-the-challenges-and-pain-points-of-the-pull-request-cycle/)) ## Example usage ### Manual triggering Invoke the tool manually by commenting `/review` on any PR: ![review comment](https://codium.ai/images/pr_agent/review_comment.png){width=512} After ~30 seconds, the tool will generate a review for the PR: ![review](https://codium.ai/images/pr_agent/review3.png){width=512} If you want to edit [configurations](#configuration-options), add the relevant ones to the command: ``` /review --pr_reviewer.some_config1=... --pr_reviewer.some_config2=... ``` ### Automatic triggering To run the `review` automatically when a PR is opened, define in a [configuration file](../usage-guide/configuration_options.md#wiki-configuration-file): ``` [github_app] pr_commands = [ "/review", ... ] [pr_reviewer] extra_instructions = "..." ... ``` - The `pr_commands` lists commands that will be executed automatically when a PR is opened. - The `[pr_reviewer]` section contains the configurations for the `review` tool you want to edit (if any). ## Configuration options ???+ example "General options"
persistent_comment If set to true, the review comment will be persistent, meaning that every new review request will edit the previous one. Default is true.
final_update_message When set to true, updating a persistent review comment during online commenting will automatically add a short comment with a link to the updated review in the pull request. Default is true.
extra_instructions Optional extra instructions to the tool. For example: "focus on the changes in the file X. Ignore change in ...".
enable_help_text If set to true, the tool will display a help text in the comment. Default is false.
num_max_findings Maximum number of returned findings. Default is 3.
???+ example "Enable\\disable specific sub-sections"
require_score_review If set to true, the tool will add a section that scores the PR. Default is false.
require_tests_review If set to true, the tool will add a section that checks if the PR contains tests. Default is true.
require_estimate_effort_to_review If set to true, the tool will add a section that estimates the effort needed to review the PR. Default is true.
require_estimate_contribution_time_cost If set to true, the tool will add a section that estimates the time required for a senior developer to create and submit such changes. Default is false.
require_can_be_split_review If set to true, the tool will add a section that checks if the PR contains several themes, and can be split into smaller PRs. Default is false.
require_security_review If set to true, the tool will add a section that checks if the PR contains a possible security or vulnerability issue. Default is true.
require_todo_scan If set to true, the tool will add a section that lists TODO comments found in the PR code changes. Default is false.
require_ticket_analysis_review If set to true, and the PR contains a GitHub or Jira ticket link, the tool will add a section that checks if the PR in fact fulfilled the ticket requirements. Default is true.
???+ example "Adding PR labels" You can enable\disable the `review` tool to add specific labels to the PR:
enable_review_labels_security If set to true, the tool will publish a 'possible security issue' label if it detects a security issue. Default is true.
enable_review_labels_effort If set to true, the tool will publish a 'Review effort x/5' label (1–5 scale). Default is true.
## Usage Tips ### General guidelines !!! tip "" The `review` tool provides a collection of configurable feedbacks about a PR. It is recommended to review the [Configuration options](#configuration-options) section, and choose the relevant options for your use case. Some of the features that are disabled by default are quite useful, and should be considered for enabling. For example: `require_score_review`, and more. On the other hand, if you find one of the enabled features to be irrelevant for your use case, disable it. No default configuration can fit all use cases. ### Automation !!! tip "" When you first install PR-Agent app, the [default mode](../usage-guide/automations_and_usage.md#github-app-automatic-tools-when-a-new-pr-is-opened) for the `review` tool is: ``` pr_commands = ["/review", ...] ``` Meaning the `review` tool will run automatically on every PR, without any additional configurations. Edit this field to enable/disable the tool, or to change the configurations used. ### Auto-generated PR labels by the Review Tool !!! tip "" The `review` tool can automatically add labels to your Pull Requests: - **`possible security issue`**: This label is applied if the tool detects a potential [security vulnerability](https://github.com/qodo-ai/pr-agent/blob/main/pr_agent/settings/pr_reviewer_prompts.toml#L121) in the PR's code. This feedback is controlled by the 'enable_review_labels_security' flag (default is true). - **`review effort [x/5]`**: This label estimates the [effort](https://github.com/qodo-ai/pr-agent/blob/main/pr_agent/settings/pr_reviewer_prompts.toml#L105) required to review the PR on a relative scale of 1 to 5, where 'x' represents the assessed effort. This feedback is controlled by the 'enable_review_labels_effort' flag (default is true). - **`ticket compliance`**: Adds a label indicating code compliance level ("Fully compliant" | "PR Code Verified" | "Partially compliant" | "Not compliant") to any GitHub/Jira/Linear ticket linked in the PR.
Controlled by the 'require_ticket_labels' flag (default: false). If 'require_no_ticket_labels' is also enabled, PRs without ticket links will receive a "No ticket found" label. ### Auto-blocking PRs from being merged based on the generated labels !!! tip "" You can configure a CI/CD Action to prevent merging PRs with specific labels. For example, implement a dedicated [GitHub Action](https://medium.com/sequra-tech/quick-tip-block-pull-request-merge-using-labels-6cc326936221). This approach helps ensure PRs with potential security issues or ticket compliance problems will not be merged without further review. Since AI may make mistakes or lack complete context, use this feature judiciously. For flexibility, users with appropriate permissions can remove generated labels when necessary. When a label is removed, this action will be automatically documented in the PR discussion, clearly indicating it was a deliberate override by an authorized user to allow the merge. ### Extra instructions !!! tip "" Extra instructions are important. The `review` tool can be configured with extra instructions, which can be used to guide the model to a feedback tailored to the needs of your project. Be specific, clear, and concise in the instructions. With extra instructions, you are the prompter. Specify the relevant sub-tool, and the relevant aspects of the PR that you want to emphasize. Examples of extra instructions: ``` [pr_reviewer] extra_instructions="""\ In the code feedback section, emphasize the following: - Does the code logic cover relevant edge cases? - Is the code logic clear and easy to understand? - Is the code logic efficient? ... """ ``` Use triple quotes to write multi-line instructions. Use bullet points to make the instructions more readable. ================================================ FILE: docs/docs/tools/similar_issues.md ================================================ ## Overview The similar issue tool retrieves the most similar issues to the current issue. 
It can be invoked manually by commenting on any PR: ``` /similar_issue ``` ## Example usage ![similar_issue_original_issue](https://codium.ai/images/pr_agent/similar_issue_original_issue.png){width=768} ![similar_issue_comment](https://codium.ai/images/pr_agent/similar_issue_comment.png){width=768} ![similar_issue](https://codium.ai/images/pr_agent/similar_issue.png){width=768} Note that to perform retrieval, the `similar_issue` tool indexes all the repo previous issues (once). ### Selecting a Vector Database Configure your preferred database by changing the `pr_similar_issue` parameter in `configuration.toml` file. #### Available Options Choose from the following Vector Databases: 1. LanceDB 2. Pinecone 3. Qdrant #### Pinecone Configuration To use Pinecone with the `similar issue` tool, add these credentials to `.secrets.toml` (or set as environment variables): ``` [pinecone] api_key = "..." environment = "..." ``` These parameters can be obtained by registering to [Pinecone](https://app.pinecone.io/?sessionType=signup/). #### Qdrant Configuration To use Qdrant with the `similar issue` tool, add these credentials to `.secrets.toml` (or set as environment variables): ``` [qdrant] url = "https://YOUR-QDRANT-URL" # e.g., https://xxxxxxxx-xxxxxxxx.eu-central-1-0.aws.cloud.qdrant.io api_key = "..." ``` Then select Qdrant in `configuration.toml`: ``` [pr_similar_issue] vectordb = "qdrant" ``` You can get a free managed Qdrant instance from [Qdrant Cloud](https://cloud.qdrant.io/). ## How to use - To invoke the 'similar issue' tool from **CLI**, run: `python3 cli.py --issue_url=... 
similar_issue` - To invoke the 'similar issue' tool via online usage, [comment](https://github.com/qodo-ai/pr-agent/issues/178#issuecomment-1716934893) on a PR: `/similar_issue` - You can also enable the 'similar issue' tool to run automatically when a new issue is opened, by adding it to the [pr_commands list in the github_app section](https://github.com/qodo-ai/pr-agent/blob/main/pr_agent/settings/configuration.toml#L229) ================================================ FILE: docs/docs/tools/update_changelog.md ================================================ ## Overview The `update_changelog` tool automatically updates the CHANGELOG.md file with the PR changes. It can be invoked manually by commenting on any PR: ``` /update_changelog ``` ## Example usage ![update_changelog_comment](https://codium.ai/images/pr_agent/update_changelog_comment.png){width=768} ![update_changelog](https://codium.ai/images/pr_agent/update_changelog.png){width=768} ## Configuration options Under the section `pr_update_changelog`, the [configuration file](https://github.com/qodo-ai/pr-agent/blob/main/pr_agent/settings/configuration.toml#L169) contains options to customize the 'update changelog' tool: - `push_changelog_changes`: whether to push the changes to CHANGELOG.md, or just publish them as a comment. Default is false (publish as comment). - `extra_instructions`: Optional extra instructions to the tool. For example: "Use the following structure: ..." - `add_pr_link`: whether the model should try to add a link to the PR in the changelog. Default is true. - `skip_ci_on_push`: whether the commit message (when `push_changelog_changes` is true) will include the term "[skip ci]", preventing CI tests from being triggered on the changelog commit. Default is true.
================================================ FILE: docs/docs/usage-guide/EXAMPLE_BEST_PRACTICE.md ================================================ ## Recommended Python Best Practices This document outlines a series of recommended best practices for Python development. These guidelines aim to improve code quality, maintainability, and readability. ### Imports Use `import` statements for packages and modules only, not for individual types, classes, or functions. #### Definition Reusability mechanism for sharing code from one module to another. #### Decision - Use `import x` for importing packages and modules. - Use `from x import y` where `x` is the package prefix and `y` is the module name with no prefix. - Use `from x import y as z` in any of the following circumstances: - Two modules named `y` are to be imported. - `y` conflicts with a top-level name defined in the current module. - `y` conflicts with a common parameter name that is part of the public API (e.g., `features`). - `y` is an inconveniently long name, or too generic in the context of your code - Use `import y as z` only when `z` is a standard abbreviation (e.g., `import numpy as np`). For example, the module `sound.effects.echo` may be imported as follows: ``` from sound.effects import echo ... echo.EchoFilter(input, output, delay=0.7, atten=4) ``` Do not use relative names in imports. Even if the module is in the same package, use the full package name. This helps prevent unintentionally importing a package twice.
##### Exemptions Exemptions from this rule: - Symbols from the following modules are used to support static analysis and type checking: - [`typing` module](https://google.github.io/styleguide/pyguide.html#typing-imports) - [`collections.abc` module](https://google.github.io/styleguide/pyguide.html#typing-imports) - [`typing_extensions` module](https://github.com/python/typing_extensions/blob/main/README.md) - Redirects from the [six.moves module](https://six.readthedocs.io/#module-six.moves). ### Packages Import each module using the full pathname location of the module. #### Decision All new code should import each module by its full package name. Imports should be as follows: ``` Yes: # Reference absl.flags in code with the complete name (verbose). import absl.flags from doctor.who import jodie _FOO = absl.flags.DEFINE_string(...) ``` ``` Yes: # Reference flags in code with just the module name (common). from absl import flags from doctor.who import jodie _FOO = flags.DEFINE_string(...) ``` _(assume this file lives in `doctor/who/` where `jodie.py` also exists)_ ``` No: # Unclear what module the author wanted and what will be imported. The actual # import behavior depends on external factors controlling sys.path. # Which possible jodie module did the author intend to import? import jodie ``` The directory the main binary is located in should not be assumed to be in `sys.path` despite that happening in some environments. This being the case, code should assume that `import jodie` refers to a third-party or top-level package named `jodie`, not a local `jodie.py`. ### Default Iterators and Operators Use default iterators and operators for types that support them, like lists, dictionaries, and files. #### Definition Container types, like dictionaries and lists, define default iterators and membership test operators (“in” and “not in”). #### Decision Use default iterators and operators for types that support them, like lists, dictionaries, and files. 
The built-in types define iterator methods, too. Prefer these methods to methods that return lists, except that you should not mutate a container while iterating over it. ``` Yes: for key in adict: ... if obj in alist: ... for line in afile: ... for k, v in adict.items(): ... ``` ``` No: for key in adict.keys(): ... for line in afile.readlines(): ... ``` ### Lambda Functions Okay for one-liners. Prefer generator expressions over `map()` or `filter()` with a `lambda`. #### Decision Lambdas are allowed. If the code inside the lambda function spans multiple lines or is longer than 60-80 chars, it might be better to define it as a regular [nested function](https://google.github.io/styleguide/pyguide.html#lexical-scoping). For common operations like multiplication, use the functions from the `operator` module instead of lambda functions. For example, prefer `operator.mul` to `lambda x, y: x * y`. ### Default Argument Values Okay in most cases. #### Definition You can specify values for variables at the end of a function’s parameter list, e.g., `def foo(a, b=0):`. If `foo` is called with only one argument, `b` is set to 0. If it is called with two arguments, `b` has the value of the second argument. #### Decision Okay to use with the following caveat: Do not use mutable objects as default values in the function or method definition. ``` Yes: def foo(a, b=None): if b is None: b = [] Yes: def foo(a, b: Sequence | None = None): if b is None: b = [] Yes: def foo(a, b: Sequence = ()): # Empty tuple OK since tuples are immutable. ... ``` ``` from absl import flags _FOO = flags.DEFINE_string(...) No: def foo(a, b=[]): ... No: def foo(a, b=time.time()): # Is `b` supposed to represent when this module was loaded? ... No: def foo(a, b=_FOO.value): # sys.argv has not yet been parsed... ... No: def foo(a, b: Mapping = {}): # Could still get passed to unchecked code. ... 
``` ### True/False Evaluations Use the “implicit” false if possible, e.g., `if foo:` rather than `if foo != []:` ### Lexical Scoping Okay to use. An example of the use of this feature is: ``` def get_adder(summand1: float) -> Callable[[float], float]: """Returns a function that adds numbers to a given number.""" def adder(summand2: float) -> float: return summand1 + summand2 return adder ``` #### Decision Okay to use. ### Threading Do not rely on the atomicity of built-in types. While Python’s built-in data types such as dictionaries appear to have atomic operations, there are corner cases where they aren’t atomic (e.g. if `__hash__` or `__eq__` are implemented as Python methods) and their atomicity should not be relied upon. Neither should you rely on atomic variable assignment (since this in turn depends on dictionaries). Use the `queue` module’s `Queue` data type as the preferred way to communicate data between threads. Otherwise, use the `threading` module and its locking primitives. Prefer condition variables and `threading.Condition` instead of using lower-level locks. ================================================ FILE: docs/docs/usage-guide/additional_configurations.md ================================================ ## Show possible configurations The possible configurations of PR-Agent are stored in [here](https://github.com/qodo-ai/pr-agent/blob/main/pr_agent/settings/configuration.toml){:target="_blank"}. In the [tools](../tools/index.md) page you can find explanations on how to use these configurations for each tool. To print all the available configurations as a comment on your PR, you can use the following command: ``` /config ``` ![possible_config1](https://codium.ai/images/pr_agent/possible_config1.png){width=512} To view the **actual** configurations used for a specific tool, after all the user settings are applied, you can add for each tool a `--config.output_relevant_configurations=true` suffix. 
For example: ``` /improve --config.output_relevant_configurations=true ``` Will output an additional field showing the actual configurations used for the `improve` tool. ![possible_config2](https://codium.ai/images/pr_agent/possible_config2.png){width=512} ## Ignoring files from analysis In some cases, you may want to exclude specific files or directories from the analysis performed by PR-Agent. This can be useful, for example, when you have files that are generated automatically or files that shouldn't be reviewed, like vendor code. You can ignore files or folders using the following methods: - `IGNORE.GLOB` - `IGNORE.REGEX` which you can edit to ignore files or folders based on glob or regex patterns. ### Example usage Let's look at an example where we want to ignore all files with `.py` extension from the analysis. To ignore Python files in a PR with online usage, comment on a PR: `/review --ignore.glob="['*.py']"` To ignore Python files in all PRs using `glob` pattern, set in a configuration file: ``` [ignore] glob = ['*.py'] ``` And to ignore Python files in all PRs using `regex` pattern, set in a configuration file: ``` [ignore] regex = ['.*\.py$'] ``` ## Extra instructions All PR-Agent tools have a parameter called `extra_instructions`, that enables to add free-text extra instructions. Example usage: ``` /update_changelog --pr_update_changelog.extra_instructions="Make sure to update also the version ..." ``` ## Language Settings The default response language for PR-Agent is **U.S. English**. However, some development teams may prefer to display information in a different language. For example, your team's workflow might improve if PR descriptions and code suggestions are set to your country's native language. To configure this, set the `response_language` parameter in the configuration file. This will prompt the model to respond in the specified language. 
Use a **standard locale code** based on [ISO 3166](https://en.wikipedia.org/wiki/ISO_3166) (country codes) and [ISO 639](https://en.wikipedia.org/wiki/ISO_639) (language codes) to define a language-country pair. See this [comprehensive list of locale codes](https://simplelocalize.io/data/locales/). Example: ```toml [config] response_language = "it-IT" ``` This will set the response language globally for all the commands to Italian. > **Important:** Note that only dynamic text generated by the AI model is translated to the configured language. Static text such as labels and table headers that are not part of the AI models response will remain in US English. In addition, the model you are using must have good support for the specified language. [//]: # (## Working with large PRs) [//]: # () [//]: # (The default mode of CodiumAI is to have a single call per tool, using GPT-4, which has a token limit of 8000 tokens.) [//]: # (This mode provides a very good speed-quality-cost tradeoff, and can handle most PRs successfully.) [//]: # (When the PR is above the token limit, it employs a [PR Compression strategy](../core-abilities/index.md).) [//]: # () [//]: # (However, for very large PRs, or in case you want to emphasize quality over speed and cost, there are two possible solutions:) [//]: # (1) [Use a model](./changing_a_model.md) with larger context, like GPT-32K, or claude-100K. This solution will be applicable for all the tools.) [//]: # (2) For the `/improve` tool, there is an ['extended' mode](../tools/improve.md) (`/improve --extended`),) [//]: # (which divides the PR into chunks, and processes each chunk separately. With this mode, regardless of the model, no compression will be done (but for large PRs, multiple model calls may occur)) ## Expand GitLab submodule diffs By default, GitLab merge requests show submodule updates as `Subproject commit` lines. 
To include the actual file-level changes from those submodules in PR-Agent analysis, enable: ```toml [gitlab] expand_submodule_diffs = true ``` When enabled, PR-Agent will fetch and attach diffs from the submodule repositories. The default is `false` to avoid extra GitLab API calls. ## Log Level PR-Agent allows you to control the verbosity of logging by using the `log_level` configuration parameter. This is particularly useful for troubleshooting and debugging issues with your PR workflows. ``` [config] log_level = "DEBUG" # Options: "DEBUG", "INFO", "WARNING", "ERROR", "CRITICAL" ``` The default log level is "DEBUG", which provides detailed output of all operations. If you prefer less verbose logs, you can set higher log levels like "INFO" or "WARNING". ## Integrating with Logging Observability Platforms Various logging observability tools can be used out-of-the box when using the default LiteLLM AI Handler. Simply configure the LiteLLM callback settings in `configuration.toml` and set environment variables according to the LiteLLM [documentation](https://docs.litellm.ai/docs/). For example, to use [LangSmith](https://www.langchain.com/langsmith) you can add the following to your `configuration.toml` file: ``` [litellm] enable_callbacks = true success_callback = ["langsmith"] failure_callback = ["langsmith"] service_callback = [] ``` Then set the following environment variables: ``` LANGSMITH_API_KEY= LANGSMITH_PROJECT= LANGSMITH_BASE_URL= ``` ## Bringing additional repository metadata to PR-Agent To provide PR-Agent tools with additional context about your project, you can enable automatic repository metadata detection. If you set: ```toml [config] add_repo_metadata = true ``` PR-Agent automatically searches for repository metadata files in your PR's head branch root directory. 
By default, it looks for: [AGENTS.MD](https://agents.md/), [QODO.MD](https://docs.codium.ai/qodo-documentation/qodo-command/getting-started/setup-and-quickstart), [CLAUDE.MD](https://www.anthropic.com/engineering/claude-code-best-practices). You can also specify custom filenames to search for: ```toml [config] add_repo_metadata_file_list= ["file1.md", "file2.md", ...] ``` ## Ignoring automatic commands in PRs PR-Agent allows you to automatically ignore certain PRs based on various criteria: - PRs with specific titles (using regex matching) - PRs between specific branches (using regex matching) - PRs from specific repositories (using regex matching) - PRs not from specific folders - PRs containing specific labels - PRs opened by specific users ### Ignoring PRs with specific titles To ignore PRs with a specific title such as "[Bump]: ...", you can add the following to your `configuration.toml` file: ```toml [config] ignore_pr_title = ["\\[Bump\\]"] ``` Where the `ignore_pr_title` is a list of regex patterns to match the PR title you want to ignore. Default is `ignore_pr_title = ["^\\[Auto\\]", "^Auto"]`. ### Ignoring PRs between specific branches To ignore PRs from specific source or target branches, you can add the following to your `configuration.toml` file: ```toml [config] ignore_pr_source_branches = ['develop', 'main', 'master', 'stage'] ignore_pr_target_branches = ["qa"] ``` Where the `ignore_pr_source_branches` and `ignore_pr_target_branches` are lists of regex patterns to match the source and target branches you want to ignore. They are not mutually exclusive, you can use them together or separately. ### Ignoring PRs from specific repositories To ignore PRs from specific repositories, you can add the following to your `configuration.toml` file: ```toml [config] ignore_repositories = ["my-org/my-repo1", "my-org/my-repo2"] ``` Where the `ignore_repositories` is a list of regex patterns to match the repositories you want to ignore. 
This is useful when you have multiple repositories and want to exclude certain ones from analysis. ### Ignoring PRs not from specific folders To allow only specific folders (often needed in large monorepos), set: ``` [config] allow_only_specific_folders=['folder1','folder2'] ``` For the configuration above, automatic feedback will only be triggered when the PR changes include files where 'folder1' or 'folder2' is in the file path ### Ignoring PRs containing specific labels To ignore PRs containing specific labels, you can add the following to your `configuration.toml` file: ``` [config] ignore_pr_labels = ["do-not-merge"] ``` Where the `ignore_pr_labels` is a list of labels that when present in the PR, the PR will be ignored. ### Ignoring PRs from specific users PR-Agent tries to automatically identify and ignore pull requests created by bots using: - GitHub's native bot detection system - Name-based pattern matching While this detection is robust, it may not catch all cases, particularly when: - Bots are registered as regular user accounts - Bot names don't match common patterns To supplement the automatic bot detection, you can manually specify users to ignore. Add the following to your `configuration.toml` file to ignore PRs from specific users: ``` [config] ignore_pr_authors = ["my-special-bot-user", ...] ``` Where the `ignore_pr_authors` is a regex list of usernames that you want to ignore. !!! note There is one specific case where bots will receive an automatic response - when they generated a PR with a _failed test_. ### Ignoring Generated Files by Language/Framework To automatically exclude files generated by specific languages or frameworks, you can add the following to your `configuration.toml` file: ``` [config] ignore_language_framework = ['protobuf', ...] ``` You can view the list of auto-generated file patterns in [`generated_code_ignore.toml`](https://github.com/qodo-ai/pr-agent/blob/main/pr_agent/settings/generated_code_ignore.toml). 
Files matching these glob patterns will be automatically excluded from PR Agent analysis. ### Ignoring Tickets with Specific Labels When PR-Agent analyzes tickets (JIRA, GitHub Issues, GitLab Issues, etc.) referenced in your PR, you may want to exclude tickets that have certain labels from the analysis. This is useful for filtering out tickets marked as "ignore-compliance", "skip-review", or other labels that indicate the ticket should not be considered during PR review. To ignore tickets with specific labels, add the following to your `configuration.toml` file: ```toml [config] ignore_ticket_labels = ["ignore-compliance", "skip-review", "wont-fix"] ``` Where `ignore_ticket_labels` is a list of label names that should be ignored during ticket analysis. ================================================ FILE: docs/docs/usage-guide/automations_and_usage.md ================================================ ## Local repo (CLI) When running from your locally cloned PR-Agent repo (CLI), your local configuration file will be used. Examples of invoking the different tools via the CLI: - **Review**: `python -m pr_agent.cli --pr_url=<pr_url> review` - **Describe**: `python -m pr_agent.cli --pr_url=<pr_url> describe` - **Improve**: `python -m pr_agent.cli --pr_url=<pr_url> improve` - **Ask**: `python -m pr_agent.cli --pr_url=<pr_url> ask "Write me a poem about this PR"` - **Update Changelog**: `python -m pr_agent.cli --pr_url=<pr_url> update_changelog` `<pr_url>` is the URL of the relevant PR (for example: [#50](https://github.com/qodo-ai/pr-agent/pull/50)). **Notes:** 1. In addition to editing your local configuration file, you can also change any configuration value by adding it to the command line: ``` python -m pr_agent.cli --pr_url=<pr_url> /review --pr_reviewer.extra_instructions="focus on the file: ..." ``` 2. You can print results locally, without publishing them, by setting in `configuration.toml`: ``` [config] publish_output=false verbosity_level=2 ``` This is useful for debugging or experimenting with different tools. 3. 
**git provider**: The [git_provider](https://github.com/qodo-ai/pr-agent/blob/main/pr_agent/settings/configuration.toml#L12) field in a configuration file determines the Git provider that will be used by PR-Agent. Currently, the following providers are supported: `github` **(default)**, `gitlab`, `bitbucket`, `azure`, `codecommit`, `local`, and `gitea`. ### CLI Health Check To verify that PR-Agent has been configured correctly, you can run this health check command from the repository root: ```bash python -m tests.health_test.main ``` If the health check passes, you will see the following output at the end of the run: ``` ======== Health test passed successfully ======== ``` Before running the health check, ensure you have: - Configured your [LLM provider](./changing_a_model.md) - Added a valid GitHub token to your configuration file ## Online usage Online usage means invoking PR-Agent tools by [comments](https://github.com/qodo-ai/pr-agent/pull/229#issuecomment-1695021901) on a PR. Commands for invoking the different tools via comments: - **Review**: `/review` - **Describe**: `/describe` - **Improve**: `/improve` (or `/improve_code` for bitbucket, since `/improve` is sometimes reserved) - **Ask**: `/ask "..."` - **Update Changelog**: `/update_changelog` To edit a specific configuration value, just add `--config_path=<value>` to any command. For example, if you want to edit the `review` tool configurations, you can run: ``` /review --pr_reviewer.extra_instructions="..." --pr_reviewer.require_score_review=false ``` Any configuration value in the [configuration file](https://github.com/qodo-ai/pr-agent/blob/main/pr_agent/settings/configuration.toml) can be similarly edited. Comment `/config` to see the list of available configurations. 
## PR-Agent Automatic Feedback ### Disabling all automatic feedback To easily disable all automatic feedback from PR-Agent (GitHub App, GitLab Webhook, BitBucket App, Azure DevOps Webhook), set in a configuration file: ```toml [config] disable_auto_feedback = true ``` When this parameter is set to `true`, PR-Agent will not run any automatic tools (like `describe`, `review`, `improve`) when a new PR is opened, or when new code is pushed to an open PR. ### GitHub App !!! note "Configurations for PR-Agent" PR-Agent for GitHub is an App, hosted by Codium. So all the instructions below are relevant for PR-Agent users. The same goes for the [GitLab webhook](#gitlab-webhook) and [BitBucket App](#bitbucket-app) sections. #### GitHub app automatic tools when a new PR is opened The [github_app](https://github.com/qodo-ai/pr-agent/blob/main/pr_agent/settings/configuration.toml#L223) section defines GitHub app-specific configurations. The configuration parameter `pr_commands` defines the list of tools that will be **run automatically** when a new PR is opened: ```toml [github_app] pr_commands = [ "/describe", "/review", "/improve", ] ``` This means that when a new PR is opened/reopened or marked as ready for review, PR-Agent will run the `describe`, `review` and `improve` tools. **Draft PRs:** By default, draft PRs are not considered for automatic tools, but you can change this by setting the `feedback_on_draft_pr` parameter to `true` in the configuration file. ```toml [github_app] feedback_on_draft_pr = true ``` **Changing default tool parameters:** You can override the default tool parameters by using one of the three options for a [configuration file](./configuration_options.md): **wiki**, **local**, or **global**. For example, if your configuration file contains: ```toml [pr_description] generate_ai_title = true ``` Every time you run the `describe` tool (including automatic runs) the PR title will be generated by the AI. 
**Parameters for automated runs:** You can customize configurations specifically for automated runs by using the `--config_path=` parameter. For instance, to modify the `review` tool settings only for newly opened PRs, use: ```toml [github_app] pr_commands = [ "/describe", "/review --pr_reviewer.extra_instructions='focus on the file: ...'", "/improve", ] ``` #### GitHub app automatic tools for push actions (commits to an open PR) In addition to running automatic tools when a PR is opened, the GitHub app can also respond to new code that is pushed to an open PR. The configuration toggle `handle_push_trigger` can be used to enable this feature. The configuration parameter `push_commands` defines the list of tools that will be **run automatically** when new code is pushed to the PR. ```toml [github_app] handle_push_trigger = true push_commands = [ "/describe", "/review", ] ``` This means that when new code is pushed to the PR, PR-Agent will run the `describe` and `review` tools, with the specified parameters. ### GitHub Action `GitHub Action` is a different way to trigger PR-Agent tools, and uses a different configuration mechanism than `GitHub App`.
You can configure settings for `GitHub Action` by adding environment variables under the `env` section in the `.github/workflows/pr_agent.yml` file. Specifically, start by setting the following environment variables: ```yaml env: OPENAI_KEY: ${{ secrets.OPENAI_KEY }} # Make sure to add your OpenAI key to your repo secrets GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} # Make sure to add your GitHub token to your repo secrets github_action_config.auto_review: "true" # enable\disable auto review github_action_config.auto_describe: "true" # enable\disable auto describe github_action_config.auto_improve: "true" # enable\disable auto improve github_action_config.pr_actions: '["opened", "reopened", "ready_for_review", "review_requested"]' ``` `github_action_config.auto_review`, `github_action_config.auto_describe` and `github_action_config.auto_improve` are used to enable/disable automatic tools that run when a new PR is opened. If not set, the default configuration is for all three tools to run automatically when a new PR is opened. `github_action_config.pr_actions` is used to configure which `pull_request` events will trigger the enabled auto flags. If not set, the default configuration is `["opened", "reopened", "ready_for_review", "review_requested"]`. `github_action_config.enable_output` is used to enable/disable the GitHub Actions [output parameter](https://docs.github.com/en/actions/creating-actions/metadata-syntax-for-github-actions#outputs-for-docker-container-and-javascript-actions) (default is `true`). The review result is output as JSON to the `steps.{step-id}.outputs.review` property. The JSON structure is equivalent to the yaml data structure defined in [pr_reviewer_prompts.toml](https://github.com/qodo-ai/pr-agent/blob/main/pr_agent/settings/pr_reviewer_prompts.toml). 
Note that you can give additional config parameters by adding environment variables to `.github/workflows/pr_agent.yml`, or by using a `.pr_agent.toml` [configuration file](./configuration_options.md#global-configuration-file) in the root of your repo For example, you can set an environment variable: `pr_description.publish_labels=false`, or add a `.pr_agent.toml` file with the following content: ```toml [pr_description] publish_labels = false ``` to prevent PR-Agent from publishing labels when running the `describe` tool. #### Enable using commands in PR You can configure your GitHub Actions workflow to trigger on `issue_comment` [events](https://docs.github.com/en/actions/reference/workflows-and-actions/events-that-trigger-workflows#issue_comment) (`created` and `edited`). Example GitHub Actions workflow configuration: ```yaml on: issue_comment: types: [created, edited] ``` When this is configured, PR-Agent can be invoked by commenting on the PR. #### Quick Reference: Model Configuration in GitHub Actions For detailed step-by-step examples of configuring different models (Gemini, Claude, Azure OpenAI, etc.) in GitHub Actions, see the [Configuration Examples](../installation/github.md#configuration-examples) section in the installation guide. 
**Common Model Configuration Patterns:** - **OpenAI**: Set `config.model: "gpt-4o"` and `OPENAI_KEY` - **Gemini**: Set `config.model: "gemini/gemini-1.5-flash"` and `GOOGLE_AI_STUDIO.GEMINI_API_KEY` (no `OPENAI_KEY` needed) - **Claude**: Set `config.model: "anthropic/claude-3-opus-20240229"` and `ANTHROPIC.KEY` (no `OPENAI_KEY` needed) - **Azure OpenAI**: Set `OPENAI.API_TYPE: "azure"`, `OPENAI.API_BASE`, and `OPENAI.DEPLOYMENT_ID` - **Local Models**: Set `config.model: "ollama/model-name"` and `OLLAMA.API_BASE` **Environment Variable Format:** - Use dots (`.`) to separate sections and keys: `config.model`, `pr_reviewer.extra_instructions` - Boolean values as strings: `"true"` or `"false"` - Arrays as JSON strings: `'["item1", "item2"]'` For complete model configuration details, see [Changing a model in PR-Agent](changing_a_model.md). ### GitLab Webhook After setting up a GitLab webhook, to control which commands will run automatically when a new MR is opened, you can set the `pr_commands` parameter in the configuration file, similar to the GitHub App: ```toml [gitlab] pr_commands = [ "/describe", "/review", "/improve", ] ``` the GitLab webhook can also respond to new code that is pushed to an open MR. The configuration toggle `handle_push_trigger` can be used to enable this feature. The configuration parameter `push_commands` defines the list of tools that will be **run automatically** when new code is pushed to the MR. ```toml [gitlab] handle_push_trigger = true push_commands = [ "/describe", "/review", ] ``` Note that to use the 'handle_push_trigger' feature, you need to give the gitlab webhook also the "Push events" scope. ### BitBucket App Similar to GitHub app, when running PR-Agent from BitBucket App, the default [configuration file](https://github.com/qodo-ai/pr-agent/blob/main/pr_agent/settings/configuration.toml) will be initially loaded. 
By uploading a local `.pr_agent.toml` file to the root of the repo's default branch, you can edit and customize any configuration parameter. Note that you need to upload `.pr_agent.toml` prior to creating a PR, in order for the configuration to take effect. For example, if your local `.pr_agent.toml` file contains: ```toml [pr_reviewer] extra_instructions = "Answer in japanese" ``` Each time you invoke a `/review` tool, it will use the extra instructions you set in the local configuration file. Note that among other limitations, BitBucket provides relatively low rate-limits for applications (up to 1000 requests per hour), and does not provide an API to track the actual rate-limit usage. If you experience a lack of responses from PR-Agent, you might want to set: `bitbucket_app.avoid_full_files=true` in your configuration file. This will prevent PR-Agent from acquiring the full file content, and will only use the diff content. This will reduce the number of requests made to BitBucket, at the cost of small decrease in accuracy, as dynamic context will not be applicable. #### BitBucket Self-Hosted App automatic tools To control which commands will run automatically when a new PR is opened, you can set the `pr_commands` parameter in the configuration file: Specifically, set the following values: ```toml [bitbucket_app] pr_commands = [ "/review", "/improve --pr_code_suggestions.commitable_code_suggestions=true --pr_code_suggestions.suggestions_score_threshold=7", ] ``` Note that we set specifically for bitbucket, we recommend using: `--pr_code_suggestions.suggestions_score_threshold=7` and that is the default value we set for bitbucket. Since this platform only supports inline code suggestions, we want to limit the number of suggestions, and only present a limited number. 
To enable BitBucket app to respond to each **push** to the PR, set (for example): ```toml [bitbucket_app] handle_push_trigger = true push_commands = [ "/describe", "/review", ] ``` ### Azure DevOps provider To use Azure DevOps provider use the following settings in configuration.toml: ```toml [config] git_provider="azure" ``` Azure DevOps provider supports [PAT token](https://learn.microsoft.com/en-us/azure/devops/organizations/accounts/use-personal-access-tokens-to-authenticate?view=azure-devops&tabs=Windows) or [DefaultAzureCredential](https://learn.microsoft.com/en-us/azure/developer/python/sdk/authentication-overview#authentication-in-server-environments) authentication. PAT is faster to create, but has a built-in expiration date, and will use the user identity for API calls. Using DefaultAzureCredential you can use a managed identity or a service principal, which are more secure and will create a separate ADO user identity (via AAD) for the agent. If PAT was chosen, you can assign the value in .secrets.toml. If DefaultAzureCredential was chosen, you can assign the additional env vars like AZURE_CLIENT_SECRET directly, or use managed identity/az cli (for local development) without any additional configuration. 
In any case, the 'org' value must be assigned in .secrets.toml: ``` [azure_devops] org = "https://dev.azure.com/YOUR_ORGANIZATION/" # pat = "YOUR_PAT_TOKEN" needed only if using PAT for authentication ``` #### Azure DevOps Webhook To control which commands will run automatically when a new PR is opened, you can set the `pr_commands` parameter in the configuration file, similar to the GitHub App: ```toml [azure_devops_server] pr_commands = [ "/describe", "/review", "/improve", ] ``` ### Gitea Webhook After setting up a Gitea webhook, to control which commands will run automatically when a new PR is opened, you can set the `pr_commands` parameter in the configuration file, similar to the GitHub App: ```toml [gitea] pr_commands = [ "/describe", "/review", "/improve", ] ``` ================================================ FILE: docs/docs/usage-guide/changing_a_model.md ================================================ ## Changing a model in PR-Agent See [here](https://github.com/qodo-ai/pr-agent/blob/main/pr_agent/algo/__init__.py) for a list of supported models in PR-Agent. The default model of PR-Agent is `GPT-5` from OpenAI. To use a different model than the default, you need to edit in the [configuration file](https://github.com/qodo-ai/pr-agent/blob/main/pr_agent/settings/configuration.toml#L7) the fields: ```toml [config] model = "..." fallback_models = ["..."] ``` For models and environments not from OpenAI, you might need to provide additional keys and other parameters. You can give parameters via a configuration file, or from environment variables. !!! note "Model-specific environment variables" See [litellm documentation](https://litellm.vercel.app/docs/proxy/quick_start#supported-llms) for the environment variables needed per model, as they may vary and change over time. Our documentation per-model may not always be up-to-date with the latest changes. 
Failing to set the needed keys of a specific model will usually result in litellm not identifying the model type, and failing to utilize it. ### OpenAI like API To use an OpenAI like API, set the following in your `.secrets.toml` file: ```toml [openai] api_base = "https://api.openai.com/v1" api_key = "sk-..." ``` or use the environment variables (make sure to use double underscores `__`): ```bash OPENAI__API_BASE=https://api.openai.com/v1 OPENAI__KEY=sk-... ``` ### OpenAI Flex Processing To reduce costs for non-urgent/background tasks, enable Flex Processing: ```toml [litellm] extra_body='{"processing_mode": "flex"}' ``` See [OpenAI Flex Processing docs](https://platform.openai.com/docs/guides/flex-processing) for details. ### Azure To use Azure, set in your `.secrets.toml` (working from CLI), or in the GitHub `Settings > Secrets and variables` (working from GitHub App or GitHub Action): ```toml [openai] key = "" # your azure api key api_type = "azure" api_version = '2023-05-15' # Check Azure documentation for the current API version api_base = "" # The base URL for your Azure OpenAI resource. e.g. "https://.openai.azure.com" deployment_id = "" # The deployment name you chose when you deployed the engine ``` and set in your configuration file: ```toml [config] model="" # the OpenAI model you've deployed on Azure (e.g. gpt-4o) fallback_models=["..."] ``` To use Azure AD (Entra id) based authentication set in your `.secrets.toml` (working from CLI), or in the GitHub `Settings > Secrets and variables` (working from GitHub App or GitHub Action): ```toml [azure_ad] client_id = "" # Your Azure AD application client ID client_secret = "" # Your Azure AD application client secret tenant_id = "" # Your Azure AD tenant ID api_base = "" # Your Azure OpenAI service base URL (e.g., https://openai.xyz.com/) ``` Passing custom headers to the underlying LLM Model API can be done by setting extra_headers parameter to litellm. 
```toml [litellm] extra_headers='{"projectId": "", ...}' # The value of this setting should be a JSON string representing the desired headers; a ValueError is thrown otherwise. ``` This enables users to pass authorization tokens or API keys, when routing requests through an API management gateway. ### Ollama You can run models locally through either [VLLM](https://docs.litellm.ai/docs/providers/vllm) or [Ollama](https://docs.litellm.ai/docs/providers/ollama) E.g. to use a new model locally via Ollama, set in `.secrets.toml` or in a configuration file: ```toml [config] model = "ollama/qwen2.5-coder:32b" fallback_models=["ollama/qwen2.5-coder:32b"] custom_model_max_tokens=128000 # set the maximal input tokens for the model duplicate_examples=true # will duplicate the examples in the prompt, to help the model to generate structured output [ollama] api_base = "http://localhost:11434" # or whatever port you're running Ollama on ``` By default, Ollama uses a context window size of 2048 tokens. In most cases this is not enough to cover pr-agent prompt and pull-request diff. Context window size can be overridden with the `OLLAMA_CONTEXT_LENGTH` environment variable. For example, to set the default context length to 8K, use: `OLLAMA_CONTEXT_LENGTH=8192 ollama serve`. You can find more information in the [official Ollama FAQ](https://github.com/ollama/ollama/blob/main/docs/faq.md#how-can-i-specify-the-context-window-size). Please note that the `custom_model_max_tokens` setting should be configured in accordance with the `OLLAMA_CONTEXT_LENGTH`. Failure to do so may result in unexpected model output. !!! note "Local models vs commercial models" PR-Agent is compatible with almost any AI model, but analyzing complex code repositories and pull requests requires a model specifically optimized for code analysis. Commercial models such as GPT-5, Claude Sonnet, and Gemini have demonstrated robust capabilities in generating structured output for code analysis tasks with large input.
In contrast, most open-source models currently available (as of January 2025) face challenges with these complex tasks. Based on our testing, local open-source models are suitable for experimentation and learning purposes (mainly for the `ask` command), but they are not suitable for production-level code analysis tasks. Hence, for production workflows and real-world usage, we recommend using commercial models. ### Hugging Face To use a new model with Hugging Face Inference Endpoints, for example, set: ```toml [config] # in configuration.toml model = "huggingface/meta-llama/Llama-2-7b-chat-hf" fallback_models=["huggingface/meta-llama/Llama-2-7b-chat-hf"] custom_model_max_tokens=... # set the maximal input tokens for the model [huggingface] # in .secrets.toml key = ... # your Hugging Face api key api_base = ... # the base url for your Hugging Face inference endpoint ``` (you can obtain a Llama2 key from [here](https://replicate.com/replicate/llama-2-70b-chat/api)) ### Replicate To use Llama2 model with Replicate, for example, set: ```toml [config] # in configuration.toml model = "replicate/llama-2-70b-chat:2c1608e18606fad2812020dc541930f2d0495ce32eee50074220b87300bc16e1" fallback_models=["replicate/llama-2-70b-chat:2c1608e18606fad2812020dc541930f2d0495ce32eee50074220b87300bc16e1"] [replicate] # in .secrets.toml key = ... ``` (you can obtain a Llama2 key from [here](https://replicate.com/replicate/llama-2-70b-chat/api)) Also, review the [.secrets_template.toml](https://github.com/qodo-ai/pr-agent/blob/main/pr_agent/settings/.secrets_template.toml) file for instructions on how to set keys for other models. ### Groq To use Llama3 model with Groq, for example, set: ```toml [config] # in configuration.toml model = "llama3-70b-8192" fallback_models = ["groq/llama3-70b-8192"] [groq] # in .secrets.toml key = ... 
# your Groq api key ``` (you can obtain a Groq key from [here](https://console.groq.com/keys)) ### xAI To use xAI's models with PR-Agent, set: ```toml [config] # in configuration.toml model = "xai/grok-2-latest" fallback_models = ["xai/grok-2-latest"] # or any other model as fallback [xai] # in .secrets.toml key = "..." # your xAI API key ``` You can obtain an xAI API key from [xAI's console](https://console.x.ai/) by creating an account and navigating to the developer settings page. ### Vertex AI To use Google's Vertex AI platform and its associated models (chat-bison/codechat-bison) set: ```toml [config] # in configuration.toml model = "vertex_ai/codechat-bison" fallback_models=["vertex_ai/codechat-bison"] [vertexai] # in .secrets.toml vertex_project = "my-google-cloud-project" vertex_location = "" ``` Your [application default credentials](https://cloud.google.com/docs/authentication/application-default-credentials) will be used for authentication so there is no need to set explicit credentials in most environments. If you do want to set explicit credentials, then you can use the `GOOGLE_APPLICATION_CREDENTIALS` environment variable set to a path to a json credentials file. ### Google AI Studio To use [Google AI Studio](https://aistudio.google.com/) models, set the relevant models in the configuration section of the configuration file: ```toml [config] # in configuration.toml model="gemini/gemini-1.5-flash" fallback_models=["gemini/gemini-1.5-flash"] [google_ai_studio] # in .secrets.toml gemini_api_key = "..." ``` If you don't want to set the API key in the .secrets.toml file, you can set the `GOOGLE_AI_STUDIO.GEMINI_API_KEY` environment variable. ### Anthropic To use Anthropic models, set the relevant models in the configuration section of the configuration file: ```toml [config] model="anthropic/claude-3-opus-20240229" fallback_models=["anthropic/claude-3-opus-20240229"] ``` And also set the api key in the .secrets.toml file: ```toml [anthropic] KEY = "..."
``` See [litellm](https://docs.litellm.ai/docs/providers/anthropic#usage) documentation for more information about the environment variables required for Anthropic. ### Amazon Bedrock To use Amazon Bedrock and its foundational models, add the below configuration: ```toml [config] # in configuration.toml model="bedrock/anthropic.claude-3-5-sonnet-20240620-v1:0" fallback_models=["bedrock/anthropic.claude-3-5-sonnet-20240620-v1:0"] [aws] AWS_ACCESS_KEY_ID="..." AWS_SECRET_ACCESS_KEY="..." AWS_REGION_NAME="..." ``` You can also use the new Meta Llama 4 models available on Amazon Bedrock: ```toml [config] # in configuration.toml model="bedrock/us.meta.llama4-scout-17b-instruct-v1:0" fallback_models=["bedrock/us.meta.llama4-maverick-17b-instruct-v1:0"] ``` #### Custom Inference Profiles To use a custom inference profile with Amazon Bedrock (for cost allocation tags and other configuration settings), add the `model_id` parameter to your configuration: ```toml [config] # in configuration.toml model="bedrock/anthropic.claude-3-5-sonnet-20240620-v1:0" fallback_models=["bedrock/anthropic.claude-3-5-sonnet-20240620-v1:0"] [aws] AWS_ACCESS_KEY_ID="..." AWS_SECRET_ACCESS_KEY="..." AWS_REGION_NAME="..." [litellm] model_id = "your-custom-inference-profile-id" ``` The `model_id` parameter will be passed to all Bedrock completion calls, allowing you to use custom inference profiles for better cost allocation and reporting. See [litellm](https://docs.litellm.ai/docs/providers/bedrock#usage) documentation for more information about the environment variables required for Amazon Bedrock. ### DeepSeek To use deepseek-chat model with DeepSeek, for example, set: ```toml [config] # in configuration.toml model = "deepseek/deepseek-chat" fallback_models=["deepseek/deepseek-chat"] ``` and fill up your key ```toml [deepseek] # in .secrets.toml key = ... 
``` (you can obtain a deepseek-chat key from [here](https://platform.deepseek.com)) ### DeepInfra To use DeepSeek model with DeepInfra, for example, set: ```toml [config] # in configuration.toml model = "deepinfra/deepseek-ai/DeepSeek-R1-Distill-Llama-70B" fallback_models = ["deepinfra/deepseek-ai/DeepSeek-R1-Distill-Qwen-32B"] [deepinfra] # in .secrets.toml key = ... # your DeepInfra api key ``` (you can obtain a DeepInfra key from [here](https://deepinfra.com/dash/api_keys)) ### Mistral To use models like Mistral or Codestral with Mistral, for example, set: ```toml [config] # in configuration.toml model = "mistral/mistral-small-latest" fallback_models = ["mistral/mistral-medium-latest"] [mistral] # in .secrets.toml key = "..." # your Mistral api key ``` (you can obtain a Mistral key from [here](https://console.mistral.ai/api-keys)) ### Codestral To use Codestral model with Codestral, for example, set: ```toml [config] # in configuration.toml model = "codestral/codestral-latest" fallback_models = ["codestral/codestral-2405"] [codestral] # in .secrets.toml key = "..." # your Codestral api key ``` (you can obtain a Codestral key from [here](https://console.mistral.ai/codestral)) ### Openrouter To use model from Openrouter, for example, set: ```toml [config] # in configuration.toml model="openrouter/anthropic/claude-3.7-sonnet" fallback_models=["openrouter/deepseek/deepseek-chat"] custom_model_max_tokens=20000 [openrouter] # in .secrets.toml or passed an environment variable openrouter__key key = "..." # your openrouter api key ``` (you can obtain an Openrouter API key from [here](https://openrouter.ai/settings/keys)) ### Custom models If the relevant model doesn't appear [here](https://github.com/qodo-ai/pr-agent/blob/main/pr_agent/algo/__init__.py), you can still use it as a custom model: 1. Set the model name in the configuration file: ```toml [config] model="custom_model_name" fallback_models=["custom_model_name"] ``` 2. 
Set the maximal tokens for the model: ```toml [config] custom_model_max_tokens= ... ``` 3. Go to [litellm documentation](https://litellm.vercel.app/docs/proxy/quick_start#supported-llms), find the model you want to use, and set the relevant environment variables. 4. Most reasoning models do not support chat-style inputs (`system` and `user` messages) or temperature settings. To bypass chat templates and temperature controls, set `config.custom_reasoning_model = true` in your configuration file. ## Dedicated parameters ### OpenAI models ```toml [config] reasoning_effort = "medium" # "low", "medium", "high" ``` With the OpenAI models that support reasoning effort (eg: o4-mini), you can specify its reasoning effort via `config` section. The default value is `medium`. You can change it to `high` or `low` based on your usage. ### Anthropic models ```toml [config] enable_claude_extended_thinking = false # Set to true to enable extended thinking feature extended_thinking_budget_tokens = 2048 extended_thinking_max_output_tokens = 4096 ``` ================================================ FILE: docs/docs/usage-guide/configuration_options.md ================================================ The different tools and sub-tools used by PR-Agent are adjustable via a Git configuration file. There are three main ways to set persistent configurations: 1. [Wiki](./configuration_options.md#wiki-configuration-file) configuration page 2. [Local](./configuration_options.md#local-configuration-file) configuration file 3. [Global](./configuration_options.md#global-configuration-file) configuration file In terms of precedence, wiki configurations will override local configurations, and local configurations will override global configurations. For a list of all possible configurations, see the [configuration options](https://github.com/qodo-ai/pr-agent/blob/main/pr_agent/settings/configuration.toml) page. In addition to general configuration options, each tool has its own configurations. 
For example, the `review` tool will use parameters from the [pr_reviewer](https://github.com/qodo-ai/pr-agent/blob/main/pr_agent/settings/configuration.toml#L66) section in the configuration file. !!! tip "Tip1: Edit only what you need" Your configuration file should be minimal, and edit only the relevant values. Don't copy the entire configuration options, since it can lead to legacy problems when something changes. !!! tip "Tip2: Show relevant configurations" If you set `config.output_relevant_configurations` to True, each tool will also output in a collapsible section its relevant configurations. This can be useful for debugging, or getting to know the configurations better. ## Wiki configuration file `Platforms supported: GitHub, GitLab, Bitbucket` With PR-Agent, you can set configurations by creating a page called `.pr_agent.toml` in the [wiki](https://github.com/qodo-ai/pr-agent/wiki/pr_agent.toml) of the repo. The advantage of this method is that it allows to set configurations without needing to commit new content to the repo - just edit the wiki page and **save**. ![wiki_configuration](https://codium.ai/images/pr_agent/wiki_configuration.png){width=512} Click [here](https://codium.ai/images/pr_agent/wiki_configuration_pr_agent.mp4) to see a short instructional video. We recommend surrounding the configuration content with triple-quotes (or \`\`\`toml), to allow better presentation when displayed in the wiki as markdown. An example content: ```toml [pr_description] generate_ai_title=true ``` PR-Agent will know to remove the surrounding quotes when reading the configuration content. ## Local configuration file `Platforms supported: GitHub, GitLab, Bitbucket, Azure DevOps` By uploading a local `.pr_agent.toml` file to the root of the repo's default branch, you can edit and customize any configuration parameter. 
Note that you need to upload or update `.pr_agent.toml` before using the PR Agent tools (either at PR creation or via manual trigger) for the configuration to take effect. For example, if you set in `.pr_agent.toml`: ``` [pr_reviewer] extra_instructions="""\ - instruction a - instruction b ... """ ``` Then you can give a list of extra instructions to the `review` tool. ## Global configuration file `Platforms supported: GitHub, GitLab (cloud), Bitbucket (cloud)` If you create a repo called `pr-agent-settings` in your **organization**, its configuration file `.pr_agent.toml` will be used as a global configuration file for any other repo that belongs to the same organization. Parameters from a local `.pr_agent.toml` file, in a specific repo, will override the global configuration parameters. For example, in the GitHub organization `qodo-ai`: - The file [`https://github.com/qodo-ai/pr-agent-settings/.pr_agent.toml`](https://github.com/qodo-ai/pr-agent-settings/blob/main/.pr_agent.toml) serves as a global configuration file for all the repos in the GitHub organization `qodo-ai`. - The repo [`https://github.com/qodo-ai/pr-agent`](https://github.com/qodo-ai/pr-agent/blob/main/.pr_agent.toml) inherits the global configuration file from `pr-agent-settings`. ## Project/Group level configuration file `Platforms supported: GitLab, Bitbucket Data Center` Create a repository named `pr-agent-settings` within a specific project (Bitbucket) or a group/subgroup (Gitlab). The configuration file in this repository will apply to all repositories directly under the same project/group/subgroup. !!! note "Note" For Gitlab, in case of a repository nested in several sub groups, the lookup for a pr-agent-settings repo will be only on one level above such repository. ## Organization level configuration file `Relevant platforms: Bitbucket Data Center` Create a dedicated project to hold a global configuration file that affects all repositories across all projects in your organization. 
**Setting up organization-level global configuration:** 1. Create a new project with both the name and key: PR_AGENT_SETTINGS. 2. Inside the PR_AGENT_SETTINGS project, create a repository named pr-agent-settings. 3. In this repository, add a `.pr_agent.toml` configuration file—structured similarly to the global configuration file described above. 4. Optionally, you can add organizational-level [global best practices](../tools/improve.md#global-hierarchical-best-practices). Repositories across your entire Bitbucket organization will inherit the configuration from this file. !!! note "Note" If both organization-level and project-level global settings are defined, the project-level settings will take precedence over the organization-level configuration. Additionally, parameters from a repository’s local .pr_agent.toml file will always override both global settings. ================================================ FILE: docs/docs/usage-guide/index.md ================================================ # Usage guide This section provides a detailed guide on how to use PR-Agent. It includes information on how to adjust PR-Agent configurations, define which tools will run automatically, and other advanced configurations. 
- [Introduction](./introduction.md) - [Configuration File](./configuration_options.md) - [Usage and Automation](./automations_and_usage.md) - [Local Repo (CLI)](./automations_and_usage.md#local-repo-cli) - [Online Usage](./automations_and_usage.md#online-usage) - [GitHub App](./automations_and_usage.md#github-app) - [GitHub Action](./automations_and_usage.md#github-action) - [GitLab Webhook](./automations_and_usage.md#gitlab-webhook) - [Gitea Webhook](./automations_and_usage.md#gitea-webhook) - [BitBucket App](./automations_and_usage.md#bitbucket-app) - [Azure DevOps Provider](./automations_and_usage.md#azure-devops-provider) - [Managing Mail Notifications](./mail_notifications.md) - [Changing a Model](./changing_a_model.md) - [Additional Configurations](./additional_configurations.md) - [Ignoring files from analysis](./additional_configurations.md#ignoring-files-from-analysis) - [Extra instructions](./additional_configurations.md#extra-instructions) - [Working with large PRs](./additional_configurations.md#working-with-large-prs) - [Changing a model](./changing_a_model.md) - [FAQ](../faq/index.md) ================================================ FILE: docs/docs/usage-guide/introduction.md ================================================ After [installation](../installation/index.md), there are three basic ways to invoke PR-Agent: 1. Locally running a CLI command 2. Online usage - by [commenting](https://github.com/qodo-ai/pr-agent/pull/229#issuecomment-1695021901){:target="_blank"} on a PR 3. Enabling PR-Agent tools to run automatically when a new PR is opened Specifically, CLI commands can be issued by invoking a pre-built [docker image](../installation/locally.md#using-docker-image), or by invoking a [locally cloned repo](../installation/locally.md#run-from-source). 
For online usage, you will need to set up either a [GitHub App](../installation/github.md#run-as-a-github-app) or a [GitHub Action](../installation/github.md#run-as-a-github-action) (GitHub), a [GitLab webhook](../installation/gitlab.md#run-a-gitlab-webhook-server) (GitLab), or a [BitBucket App](../installation/bitbucket.md#run-using-codiumai-hosted-bitbucket-app) (BitBucket). These platforms also enable running PR-Agent-specific tools automatically when a new PR is opened, or on each push to a branch. ================================================ FILE: docs/docs/usage-guide/mail_notifications.md ================================================ Unfortunately, it is not possible in GitHub to disable mail notifications from a specific user. If you are subscribed to notifications for a repo with PR-Agent, we recommend turning off notifications for PR comments, to avoid lengthy emails: ![notifications](https://codium.ai/images/pr_agent/notifications.png){width=512} As an alternative, you can filter in your mail provider the notifications specifically from the PR-Agent bot, [see how](https://www.quora.com/How-can-you-filter-emails-for-specific-people-in-Gmail#:~:text=On%20the%20Filters%20and%20Blocked,the%20body%20of%20the%20email). ![filter_mail_notifications](https://codium.ai/images/pr_agent/filter_mail_notifications.png){width=512} Another option to reduce the mail overload, yet still receive notifications on PR-Agent tools, is to disable the help collapsible section in PR-Agent bot comments. This can be done by setting `enable_help_text=false` for the relevant tool in the configuration file.
For example, to disable the help text for the `pr_reviewer` tool, set: ``` [pr_reviewer] enable_help_text = false ``` ================================================ FILE: docs/mkdocs.yml ================================================ site_name: PR-Agent repo_url: https://github.com/qodo-ai/pr-agent repo_name: Qodo-ai/pr-agent nav: - Overview: - 'index.md' - Data Privacy: 'overview/data_privacy.md' - Installation: - 'installation/index.md' - PR-Agent: 'installation/pr_agent.md' - Usage Guide: - 'usage-guide/index.md' - Introduction: 'usage-guide/introduction.md' - Configuration File: 'usage-guide/configuration_options.md' - Usage and Automation: 'usage-guide/automations_and_usage.md' - Managing Mail Notifications: 'usage-guide/mail_notifications.md' - Changing a Model: 'usage-guide/changing_a_model.md' - Additional Configurations: 'usage-guide/additional_configurations.md' - Frequently Asked Questions: 'faq/index.md' - Tools: - 'tools/index.md' - Describe: 'tools/describe.md' - Review: 'tools/review.md' - Improve: 'tools/improve.md' - Ask: 'tools/ask.md' - Add Docs: 'tools/add_docs.md' - Generate Labels: 'tools/generate_labels.md' - Similar Issues: 'tools/similar_issues.md' - Help: 'tools/help.md' - Help Docs: 'tools/help_docs.md' - Update Changelog: 'tools/update_changelog.md' - Core Abilities: - 'core-abilities/index.md' - Compression strategy: 'core-abilities/compression_strategy.md' - Dynamic context: 'core-abilities/dynamic_context.md' - Fetching ticket context: 'core-abilities/fetching_ticket_context.md' - Interactivity: 'core-abilities/interactivity.md' - Local and global metadata: 'core-abilities/metadata.md' - Self-reflection: 'core-abilities/self_reflection.md' # - Code Fine-tuning Benchmark: 'finetuning_benchmark/index.md' theme: logo: assets/favicon.svg favicon: assets/favicon.svg name: material icon: repo: fontawesome/brands/github features: - navigation.tabs - navigation.expand - navigation.path - navigation.top - navigation.tracking - 
navigation.indexes - search.suggest - search.highlight - content.tabs.link - content.code.annotation - content.code.copy - announce.dismiss language: en custom_dir: overrides palette: - media: "(prefers-color-scheme)" toggle: icon: material/brightness-auto name: Switch to light mode - media: "(prefers-color-scheme: light)" scheme: default toggle: icon: material/toggle-switch-off-outline name: Switch to dark mode primary: custom accent: custom - media: "(prefers-color-scheme: dark)" scheme: slate toggle: icon: material/toggle-switch name: Switch to light mode primary: custom accent: custom plugins: - social - search - glightbox extra: generator: false social: - icon: fontawesome/brands/github link: https://github.com/qodo-ai/pr-agent extra_css: - css/custom.css markdown_extensions: - pymdownx.highlight: anchor_linenums: true - pymdownx.inlinehilite - pymdownx.snippets - admonition - pymdownx.arithmatex: generic: true - footnotes - pymdownx.details - pymdownx.superfences - pymdownx.mark - md_in_html - attr_list - pymdownx.emoji: emoji_index: !!python/name:material.extensions.emoji.twemoji emoji_generator: !!python/name:material.extensions.emoji.to_svg - pymdownx.tabbed: alternate_style: true - toc: title: On this page toc_depth: 3 permalink: true copyright: | © 2026 PR-Agent Contributors ================================================ FILE: docs/overrides/main.html ================================================ {% extends "base.html" %} {% block announce %} Open source PR Agent documentation. For the Qodo free version, Get Started: https://www.qodo.ai/get-started/ {% endblock %} {% block scripts %} {{ super() }} {% endblock %} ================================================ FILE: docs/overrides/partials/footer.html ================================================ Footer
import shlex
from functools import partial

from pr_agent.algo.ai_handlers.base_ai_handler import BaseAiHandler
from pr_agent.algo.ai_handlers.litellm_ai_handler import LiteLLMAIHandler
from pr_agent.algo.cli_args import CliArgs
from pr_agent.algo.utils import update_settings_from_args
from pr_agent.config_loader import get_settings
from pr_agent.git_providers.utils import apply_repo_settings
from pr_agent.log import get_logger
from pr_agent.tools.pr_add_docs import PRAddDocs
from pr_agent.tools.pr_code_suggestions import PRCodeSuggestions
from pr_agent.tools.pr_config import PRConfig
from pr_agent.tools.pr_description import PRDescription
from pr_agent.tools.pr_generate_labels import PRGenerateLabels
from pr_agent.tools.pr_help_docs import PRHelpDocs
from pr_agent.tools.pr_help_message import PRHelpMessage
from pr_agent.tools.pr_line_questions import PR_LineQuestions
from pr_agent.tools.pr_questions import PRQuestions
from pr_agent.tools.pr_reviewer import PRReviewer
from pr_agent.tools.pr_similar_issue import PRSimilarIssue
from pr_agent.tools.pr_update_changelog import PRUpdateChangelog

# Maps every accepted command name (including aliases) to the tool class
# that is instantiated and run for it.
command2class = {
    "auto_review": PRReviewer,
    "answer": PRReviewer,
    "review": PRReviewer,
    "review_pr": PRReviewer,
    "describe": PRDescription,
    "describe_pr": PRDescription,
    "improve": PRCodeSuggestions,
    "improve_code": PRCodeSuggestions,
    "ask": PRQuestions,
    "ask_question": PRQuestions,
    "ask_line": PR_LineQuestions,
    "update_changelog": PRUpdateChangelog,
    "config": PRConfig,
    "settings": PRConfig,
    "help": PRHelpMessage,
    "similar_issue": PRSimilarIssue,
    "add_docs": PRAddDocs,
    "generate_labels": PRGenerateLabels,
    "help_docs": PRHelpDocs,
}

# Flat list of the accepted command names, in declaration order.
commands = list(command2class.keys())


class PRAgent:
    """Dispatch a textual or pre-split command to the matching PR tool."""

    def __init__(self, ai_handler: partial[BaseAiHandler,] = LiteLLMAIHandler):
        self.ai_handler = ai_handler  # will be initialized in run_action

    @staticmethod
    def _append_response_language_instruction() -> None:
        """Add a "respond in <locale>" instruction to every tool section.

        Reads `config.response_language` (default 'en-us'). For any other
        locale, each settings section that exposes `extra_instructions` gets
        the language instruction appended, unless it is already present.
        """
        settings = get_settings()
        response_language = settings.config.get('response_language', 'en-us')
        if response_language.lower() == 'en-us':
            return

        get_logger().info(f'User has set the response language to: {response_language}')

        # Loop-invariant texts: the instruction itself and the separator used
        # when the section already carries extra instructions.
        lang_instruction_text = f"Your response MUST be written in the language corresponding to locale code: '{response_language}'. This is crucial."
        separator_text = "\n======\n\nIn addition, "

        for key in settings:
            setting = settings.get(key)
            # Sections are recognised by the string form of their type, so
            # this module does not need to import dynaconf.
            # NOTE(review): assumes settings sections are dynaconf DynaBox
            # instances -- confirm against the config loader.
            if str(type(setting)) != "<class 'dynaconf.utils.boxing.DynaBox'>":
                continue
            if not hasattr(setting, 'extra_instructions'):
                continue

            current_extra_instructions = setting.extra_instructions
            # Skip sections that already contain the instruction so repeated
            # requests do not duplicate it.
            if lang_instruction_text in str(current_extra_instructions):
                continue

            if current_extra_instructions:  # existing text: append after a separator
                setting.extra_instructions = str(current_extra_instructions) + separator_text + lang_instruction_text
            else:  # None or empty: the instruction becomes the whole value
                setting.extra_instructions = lang_instruction_text

    async def _handle_request(self, pr_url, request, notify=None) -> bool:
        """Parse a command request and run the matching tool.

        Args:
            pr_url: URL of the pull request the command targets.
            request: Either a raw command string, which is tokenised
                shell-style, or an already-split sequence whose first item is
                the command name and the rest are its arguments.
            notify: Optional zero-argument callback invoked right before the
                tool runs (it is not invoked for "auto_review").

        Returns:
            True when a tool was dispatched and its run completed; False when
            an argument was rejected or the command is unknown.
        """
        # First, apply repo specific settings if exists
        apply_repo_settings(pr_url)

        # Then, apply user specific settings if exists
        if isinstance(request, str):
            # Escape single quotes so shlex treats them as literal characters
            # rather than as (possibly unbalanced) quoting.
            request = request.replace("'", "\\'")
            lexer = shlex.shlex(request, posix=True)
            lexer.whitespace_split = True
            action, *args = list(lexer)
        else:
            action, *args = request

        # validate args
        is_valid, arg = CliArgs.validate_user_args(args)
        if not is_valid:
            get_logger().error(
                f"CLI argument for param '{arg}' is forbidden. Use instead a configuration file."
            )
            return False

        # Update settings from args
        args = update_settings_from_args(args)

        # Append the response language in the extra instructions
        self._append_response_language_instruction()

        action = action.lstrip("/").lower()
        if action not in command2class:
            get_logger().warning(f"Unknown command: {action}")
            return False

        with get_logger().contextualize(command=action, pr_url=pr_url):
            get_logger().info("PR-Agent request handler started", analytics=True)
            if action == "answer":
                if notify:
                    notify()
                await PRReviewer(pr_url, is_answer=True, args=args, ai_handler=self.ai_handler).run()
            elif action == "auto_review":
                # Automatic reviews run without notifying the caller.
                await PRReviewer(pr_url, is_auto=True, args=args, ai_handler=self.ai_handler).run()
            else:
                # The membership guard above guarantees the lookup succeeds.
                if notify:
                    notify()
                await command2class[action](pr_url, ai_handler=self.ai_handler, args=args).run()
            return True

    async def handle_request(self, pr_url, request, notify=None) -> bool:
        """Run `_handle_request`, converting failures into a False result.

        Any `Exception` raised while handling the command is logged with its
        traceback and reported as False. Cancellation and interpreter-exit
        signals (`asyncio.CancelledError`, `KeyboardInterrupt`, `SystemExit`)
        are deliberately left to propagate.
        """
        try:
            return await self._handle_request(pr_url, request, notify)
        except Exception:
            get_logger().exception("Failed to process the command.")
            return False
may be limited by config.max_model_tokens 'gpt-4o-2024-05-13': 128000, # 128K, but may be limited by config.max_model_tokens 'gpt-4-turbo-preview': 128000, # 128K, but may be limited by config.max_model_tokens 'gpt-4-turbo-2024-04-09': 128000, # 128K, but may be limited by config.max_model_tokens 'gpt-4-turbo': 128000, # 128K, but may be limited by config.max_model_tokens 'gpt-4o-mini': 128000, # 128K, but may be limited by config.max_model_tokens 'gpt-4o-mini-2024-07-18': 128000, # 128K, but may be limited by config.max_model_tokens 'gpt-4o-2024-08-06': 128000, # 128K, but may be limited by config.max_model_tokens 'gpt-4o-2024-11-20': 128000, # 128K, but may be limited by config.max_model_tokens 'gpt-4.5-preview': 128000, # 128K, but may be limited by config.max_model_tokens 'gpt-4.5-preview-2025-02-27': 128000, # 128K, but may be limited by config.max_model_tokens 'gpt-4.1': 1047576, 'gpt-4.1-2025-04-14': 1047576, 'gpt-4.1-mini': 1047576, 'gpt-4.1-mini-2025-04-14': 1047576, 'gpt-4.1-nano': 1047576, 'gpt-4.1-nano-2025-04-14': 1047576, 'gpt-5-nano': 200000, # 200K, but may be limited by config.max_model_tokens 'gpt-5-mini': 200000, # 200K, but may be limited by config.max_model_tokens 'gpt-5': 200000, 'gpt-5-2025-08-07': 200000, 'gpt-5.1': 200000, 'gpt-5.1-2025-11-13': 200000, 'gpt-5.1-chat-latest': 200000, 'gpt-5.1-codex': 200000, 'gpt-5.1-codex-mini': 200000, 'gpt-5.2': 400000, # 400K, but may be limited by config.max_model_tokens 'gpt-5.2-2025-12-11': 400000, # 400K, but may be limited by config.max_model_tokens 'gpt-5.2-chat-latest': 128000, # 128K, but may be limited by config.max_model_tokens 'gpt-5.2-codex': 400000, # 400K, but may be limited by config.max_model_tokens 'gpt-5.3-codex': 400000, # 400K, but may be limited by config.max_model_tokens 'gpt-5.4': 272000, # 272K safe default without opt-in 1M context parameters 'gpt-5.4-2026-03-05': 272000, # 272K safe default without opt-in 1M context parameters 'o1-mini': 128000, # 128K, but may be limited by 
config.max_model_tokens 'o1-mini-2024-09-12': 128000, # 128K, but may be limited by config.max_model_tokens 'o1-preview': 128000, # 128K, but may be limited by config.max_model_tokens 'o1-preview-2024-09-12': 128000, # 128K, but may be limited by config.max_model_tokens 'o1-2024-12-17': 204800, # 200K, but may be limited by config.max_model_tokens 'o1': 204800, # 200K, but may be limited by config.max_model_tokens 'o3-mini': 204800, # 200K, but may be limited by config.max_model_tokens 'o3-mini-2025-01-31': 204800, # 200K, but may be limited by config.max_model_tokens 'o3': 200000, # 200K, but may be limited by config.max_model_tokens 'o3-2025-04-16': 200000, # 200K, but may be limited by config.max_model_tokens 'o4-mini': 200000, # 200K, but may be limited by config.max_model_tokens 'o4-mini-2025-04-16': 200000, # 200K, but may be limited by config.max_model_tokens 'claude-instant-1': 100000, 'claude-2': 100000, 'command-nightly': 4096, 'deepseek/deepseek-chat': 128000, # 128K, but may be limited by config.max_model_tokens 'deepseek/deepseek-reasoner': 64000, # 64K, but may be limited by config.max_model_tokens 'openai/qwq-plus': 131072, # 131K context length, but may be limited by config.max_model_tokens 'replicate/llama-2-70b-chat:2c1608e18606fad2812020dc541930f2d0495ce32eee50074220b87300bc16e1': 4096, 'meta-llama/Llama-2-7b-chat-hf': 4096, 'vertex_ai/codechat-bison': 6144, 'vertex_ai/codechat-bison-32k': 32000, 'vertex_ai/claude-3-haiku@20240307': 100000, 'vertex_ai/claude-3-5-haiku@20241022': 100000, 'vertex_ai/claude-haiku-4-5@20251001': 200000, 'vertex_ai/claude-3-sonnet@20240229': 100000, 'vertex_ai/claude-3-opus@20240229': 100000, 'vertex_ai/claude-opus-4@20250514': 200000, 'vertex_ai/claude-opus-4-1@20250805': 200000, 'vertex_ai/claude-opus-4-5@20251101': 200000, 'vertex_ai/claude-opus-4-6@20260120': 200000, 'vertex_ai/claude-opus-4-6': 200000, 'vertex_ai/claude-3-5-sonnet@20240620': 100000, 'vertex_ai/claude-3-5-sonnet-v2@20241022': 100000, 
'vertex_ai/claude-3-7-sonnet@20250219': 200000, 'vertex_ai/claude-sonnet-4@20250514': 200000, 'vertex_ai/claude-sonnet-4-5@20250929': 200000, 'vertex_ai/claude-sonnet-4-6': 200000, 'vertex_ai/gemini-1.5-pro': 1048576, 'vertex_ai/gemini-2.5-pro-preview-03-25': 1048576, 'vertex_ai/gemini-2.5-pro-preview-05-06': 1048576, 'vertex_ai/gemini-2.5-pro-preview-06-05': 1048576, 'vertex_ai/gemini-2.5-pro': 1048576, 'vertex_ai/gemini-1.5-flash': 1048576, 'vertex_ai/gemini-2.0-flash': 1048576, 'vertex_ai/gemini-2.5-flash-preview-04-17': 1048576, 'vertex_ai/gemini-2.5-flash-preview-05-20': 1048576, 'vertex_ai/gemini-2.5-flash': 1048576, 'vertex_ai/gemini-3-flash-preview': 1048576, 'vertex_ai/gemini-3-pro-preview': 1048576, 'vertex_ai/gemini-3.1-pro-preview': 1048576, 'vertex_ai/gemma2': 8200, 'gemini/gemini-1.5-pro': 1048576, 'gemini/gemini-1.5-flash': 1048576, 'gemini/gemini-2.0-flash': 1048576, 'gemini/gemini-2.5-flash-preview-04-17': 1048576, 'gemini/gemini-2.5-flash-preview-05-20': 1048576, 'gemini/gemini-2.5-flash': 1048576, 'gemini/gemini-2.5-pro-preview-03-25': 1048576, 'gemini/gemini-2.5-pro-preview-05-06': 1048576, 'gemini/gemini-2.5-pro-preview-06-05': 1048576, 'gemini/gemini-2.5-pro': 1048576, 'gemini/gemini-3-flash-preview': 1048576, 'gemini/gemini-3-pro-preview': 1048576, 'gemini/gemini-3.1-pro-preview': 1048576, 'codechat-bison': 6144, 'codechat-bison-32k': 32000, 'anthropic.claude-instant-v1': 100000, 'anthropic.claude-v1': 100000, 'anthropic.claude-v2': 100000, 'anthropic/claude-3-opus-20240229': 100000, 'anthropic/claude-opus-4-20250514': 200000, 'anthropic/claude-opus-4-1-20250805': 200000, 'anthropic/claude-opus-4-5-20251101': 200000, 'anthropic/claude-opus-4-6': 200000, 'anthropic/claude-opus-4-6-20260120': 200000, 'anthropic/claude-3-5-sonnet-20240620': 100000, 'anthropic/claude-3-5-sonnet-20241022': 100000, 'anthropic/claude-3-7-sonnet-20250219': 200000, 'anthropic/claude-sonnet-4-20250514': 200000, 'anthropic/claude-sonnet-4-5-20250929': 200000, 
'anthropic/claude-sonnet-4-6': 200000, 'claude-opus-4-1-20250805': 200000, 'claude-opus-4-5-20251101': 200000, 'claude-opus-4-6': 200000, 'claude-opus-4-6-20260120': 200000, 'claude-3-7-sonnet-20250219': 200000, 'claude-sonnet-4-6': 200000, 'anthropic/claude-3-5-haiku-20241022': 100000, 'anthropic/claude-haiku-4-5-20251001': 200000, 'claude-haiku-4-5-20251001': 200000, 'bedrock/anthropic.claude-instant-v1': 100000, 'bedrock/anthropic.claude-v2': 100000, 'bedrock/anthropic.claude-v2:1': 100000, 'bedrock/anthropic.claude-3-sonnet-20240229-v1:0': 100000, 'bedrock/anthropic.claude-opus-4-20250514-v1:0': 200000, 'bedrock/anthropic.claude-opus-4-1-20250805-v1:0': 200000, 'bedrock/anthropic.claude-opus-4-6-20260120-v1:0': 200000, 'bedrock/anthropic.claude-opus-4-6-v1:0': 200000, 'bedrock/anthropic.claude-3-haiku-20240307-v1:0': 100000, 'bedrock/anthropic.claude-3-5-haiku-20241022-v1:0': 100000, 'bedrock/anthropic.claude-haiku-4-5-20251001-v1:0': 200000, 'bedrock/anthropic.claude-3-5-sonnet-20240620-v1:0': 100000, 'bedrock/anthropic.claude-3-5-sonnet-20241022-v2:0': 100000, 'bedrock/anthropic.claude-3-7-sonnet-20250219-v1:0': 200000, 'bedrock/anthropic.claude-sonnet-4-20250514-v1:0': 200000, 'bedrock/anthropic.claude-sonnet-4-5-20250929-v1:0': 200000, 'bedrock/anthropic.claude-sonnet-4-6': 200000, "bedrock/us.anthropic.claude-opus-4-20250514-v1:0": 200000, "bedrock/us.anthropic.claude-opus-4-1-20250805-v1:0": 200000, "bedrock/us.anthropic.claude-opus-4-6-20260120-v1:0": 200000, "bedrock/global.anthropic.claude-opus-4-5-20251101-v1:0": 200000, "bedrock/us.anthropic.claude-opus-4-5-20251101-v1:0": 200000, "bedrock/global.anthropic.claude-opus-4-6-v1:0": 200000, "bedrock/us.anthropic.claude-opus-4-6-v1:0": 200000, "bedrock/us.anthropic.claude-3-5-sonnet-20241022-v2:0": 100000, "bedrock/us.anthropic.claude-haiku-4-5-20251001-v1:0": 200000, "bedrock/eu.anthropic.claude-haiku-4-5-20251001-v1:0": 200000, "bedrock/au.anthropic.claude-haiku-4-5-20251001-v1:0": 200000, 
"bedrock/jp.anthropic.claude-haiku-4-5-20251001-v1:0": 200000, "bedrock/apac.anthropic.claude-haiku-4-5-20251001-v1:0": 200000, "bedrock/global.anthropic.claude-haiku-4-5-20251001-v1:0": 200000, "bedrock/us.anthropic.claude-3-7-sonnet-20250219-v1:0": 200000, "bedrock/us.anthropic.claude-sonnet-4-20250514-v1:0": 200000, "bedrock/global.anthropic.claude-sonnet-4-20250514-v1:0": 200000, "bedrock/us.anthropic.claude-sonnet-4-5-20250929-v1:0": 200000, "bedrock/au.anthropic.claude-sonnet-4-5-20250929-v1:0": 200000, "bedrock/us.anthropic.claude-sonnet-4-6": 200000, "bedrock/au.anthropic.claude-sonnet-4-6": 200000, "bedrock/apac.anthropic.claude-3-5-sonnet-20241022-v2:0": 100000, "bedrock/apac.anthropic.claude-3-7-sonnet-20250219-v1:0": 200000, "bedrock/apac.anthropic.claude-sonnet-4-20250514-v1:0": 200000, "bedrock/eu.anthropic.claude-sonnet-4-5-20250929-v1:0": 200000, "bedrock/eu.anthropic.claude-sonnet-4-6": 200000, "bedrock/jp.anthropic.claude-sonnet-4-5-20250929-v1:0": 200000, "bedrock/jp.anthropic.claude-sonnet-4-6": 200000, "bedrock/global.anthropic.claude-sonnet-4-5-20250929-v1:0": 200000, "bedrock/global.anthropic.claude-sonnet-4-6": 200000, 'claude-3-5-sonnet': 100000, 'bedrock/us.meta.llama4-scout-17b-instruct-v1:0': 128000, 'bedrock/us.meta.llama4-maverick-17b-instruct-v1:0': 128000, 'groq/openai/gpt-oss-120b': 131072, 'groq/openai/gpt-oss-20b': 131072, 'groq/qwen/qwen3-32b': 131000, 'groq/moonshotai/kimi-k2-instruct': 131072, 'groq/deepseek-r1-distill-llama-70b': 128000, 'groq/meta-llama/llama-4-maverick-17b-128e-instruct': 131072, 'groq/meta-llama/llama-4-scout-17b-16e-instruct': 131072, 'groq/llama-3.3-70b-versatile': 128000, 'groq/llama-3.1-8b-instant': 128000, 'xai/grok-2': 131072, 'xai/grok-2-1212': 131072, 'xai/grok-2-latest': 131072, 'xai/grok-3': 131072, 'xai/grok-3-beta': 131072, 'xai/grok-3-fast': 131072, 'xai/grok-3-fast-beta': 131072, 'xai/grok-3-mini': 131072, 'xai/grok-3-mini-beta': 131072, 'xai/grok-3-mini-fast': 131072, 
class BaseAiHandler(ABC):
    """
    Abstract contract that every AI handler used by the PR Agents must fulfil.

    A concrete handler has to provide a constructor, a ``deployment_id``
    property and an asynchronous ``chat_completion`` coroutine.
    """

    @abstractmethod
    def __init__(self):
        """Set up the handler. Concrete subclasses must override this."""

    @property
    @abstractmethod
    def deployment_id(self):
        """Deployment identifier used by the concrete handler."""

    @abstractmethod
    async def chat_completion(self, model: str, system: str, user: str, temperature: float = 0.2, img_path: str = None):
        """
        Produce a chat completion from the AI model.

        Args:
            model (str): name of the model to query
            system (str): system message for the conversation
            user (str): user message for the conversation
            temperature (float): sampling temperature for the completion
            img_path (str): optional image reference accompanying the user message

        Implementations are expected to return a ``(response_text, finish_reason)`` tuple.
        """
async def _create_chat_async(self, deployment_id=None):
    """
    Build the LangChain chat model described by the OpenAI settings.

    Args:
        deployment_id: Azure deployment name; only used when ``self.azure`` is set.

    Returns:
        An ``AzureChatOpenAI`` instance when Azure is configured, otherwise a
        ``ChatOpenAI`` instance (pointed at ``OPENAI.API_BASE`` when that
        setting is non-empty).

    Raises:
        ValueError: if a required setting is missing (surfaced as an
            ``AttributeError`` while reading the settings).
    """
    try:
        if self.azure:
            # Using Azure OpenAI service
            return AzureChatOpenAI(
                openai_api_key=get_settings().openai.key,
                openai_api_version=get_settings().openai.api_version,
                azure_deployment=deployment_id,
                azure_endpoint=get_settings().openai.api_base,
            )

        # Using standard OpenAI or other LLM services
        openai_api_base = get_settings().get("OPENAI.API_BASE", None)
        if not openai_api_base:
            return ChatOpenAI(openai_api_key=get_settings().openai.key)
        return ChatOpenAI(
            openai_api_key=get_settings().openai.key,
            openai_api_base=openai_api_base,
        )
    except AttributeError as e:
        # Handle configuration errors.
        # ``AttributeError.name`` is not guaranteed to exist (and may be None),
        # so read it with a default instead of letting the handler itself fail.
        missing_name = getattr(e, "name", None)
        error_msg = f"OpenAI {missing_name} is required" if missing_name else str(e)
        get_logger().error(error_msg)
        raise ValueError(error_msg) from e
class LiteLLMAIHandler(BaseAiHandler):
    """
    AI handler that performs chat completions through litellm.

    The constructor copies provider credentials and endpoints from the
    settings object into ``litellm`` / ``openai`` module attributes and
    environment variables; ``chat_completion`` then builds the request
    arguments and delegates to ``litellm.acompletion``.
    """

    def __init__(self):
        """
        Configure litellm from the settings object.

        Each provider section is applied only when its key/endpoint is present.
        Several sections write the same targets (``litellm.api_key``,
        ``litellm.api_base``, ``self.api_base``), so when more than one is
        configured the section applied last wins; the statement order below is
        therefore significant.

        When no OpenAI key is configured and ``OPENAI_API_KEY`` is not in the
        environment, a placeholder key is installed instead of raising.
        """
        self.azure = False
        self.api_base = None
        self.repetition_penalty = None

        if get_settings().get("LITELLM.DISABLE_AIOHTTP", False):
            litellm.disable_aiohttp_transport = True

        if get_settings().get("OPENAI.KEY", None):
            openai.api_key = get_settings().openai.key
            litellm.openai_key = get_settings().openai.key
        elif 'OPENAI_API_KEY' not in os.environ:
            litellm.api_key = "dummy_key"
        if get_settings().get("aws.AWS_ACCESS_KEY_ID"):
            # NOTE(review): an assert is skipped under ``python -O``; kept as-is so the
            # raised exception type does not change for existing callers.
            assert get_settings().aws.AWS_SECRET_ACCESS_KEY and get_settings().aws.AWS_REGION_NAME, "AWS credentials are incomplete"
            os.environ["AWS_ACCESS_KEY_ID"] = get_settings().aws.AWS_ACCESS_KEY_ID
            os.environ["AWS_SECRET_ACCESS_KEY"] = get_settings().aws.AWS_SECRET_ACCESS_KEY
            os.environ["AWS_REGION_NAME"] = get_settings().aws.AWS_REGION_NAME
        if get_settings().get("LITELLM.DROP_PARAMS", None):
            litellm.drop_params = get_settings().litellm.drop_params
        if get_settings().get("LITELLM.SUCCESS_CALLBACK", None):
            litellm.success_callback = get_settings().litellm.success_callback
        if get_settings().get("LITELLM.FAILURE_CALLBACK", None):
            litellm.failure_callback = get_settings().litellm.failure_callback
        if get_settings().get("LITELLM.SERVICE_CALLBACK", None):
            litellm.service_callback = get_settings().litellm.service_callback
        if get_settings().get("OPENAI.ORG", None):
            litellm.organization = get_settings().openai.org
        if get_settings().get("OPENAI.API_TYPE", None):
            if get_settings().openai.api_type == "azure":
                self.azure = True
                litellm.azure_key = get_settings().openai.key
        if get_settings().get("OPENAI.API_VERSION", None):
            litellm.api_version = get_settings().openai.api_version
        if get_settings().get("OPENAI.API_BASE", None):
            litellm.api_base = get_settings().openai.api_base
            self.api_base = get_settings().openai.api_base
        if get_settings().get("ANTHROPIC.KEY", None):
            litellm.anthropic_key = get_settings().anthropic.key
        if get_settings().get("COHERE.KEY", None):
            litellm.cohere_key = get_settings().cohere.key
        if get_settings().get("GROQ.KEY", None):
            litellm.api_key = get_settings().groq.key
        if get_settings().get("REPLICATE.KEY", None):
            litellm.replicate_key = get_settings().replicate.key
        if get_settings().get("XAI.KEY", None):
            litellm.api_key = get_settings().xai.key
        if get_settings().get("HUGGINGFACE.KEY", None):
            litellm.huggingface_key = get_settings().huggingface.key
        if get_settings().get("HUGGINGFACE.API_BASE", None) and 'huggingface' in get_settings().config.model:
            litellm.api_base = get_settings().huggingface.api_base
            self.api_base = get_settings().huggingface.api_base
        if get_settings().get("OLLAMA.API_BASE", None):
            litellm.api_base = get_settings().ollama.api_base
            self.api_base = get_settings().ollama.api_base
        if get_settings().get("HUGGINGFACE.REPETITION_PENALTY", None):
            self.repetition_penalty = float(get_settings().huggingface.repetition_penalty)
        if get_settings().get("VERTEXAI.VERTEX_PROJECT", None):
            litellm.vertex_project = get_settings().vertexai.vertex_project
            litellm.vertex_location = get_settings().get(
                "VERTEXAI.VERTEX_LOCATION", None
            )
        # Google AI Studio
        # SEE https://docs.litellm.ai/docs/providers/gemini
        if get_settings().get("GOOGLE_AI_STUDIO.GEMINI_API_KEY", None):
            os.environ["GEMINI_API_KEY"] = get_settings().google_ai_studio.gemini_api_key

        # Support deepseek models
        if get_settings().get("DEEPSEEK.KEY", None):
            os.environ['DEEPSEEK_API_KEY'] = get_settings().get("DEEPSEEK.KEY")

        # Support deepinfra models
        if get_settings().get("DEEPINFRA.KEY", None):
            os.environ['DEEPINFRA_API_KEY'] = get_settings().get("DEEPINFRA.KEY")

        # Support mistral models
        if get_settings().get("MISTRAL.KEY", None):
            os.environ["MISTRAL_API_KEY"] = get_settings().get("MISTRAL.KEY")

        # Support codestral models
        if get_settings().get("CODESTRAL.KEY", None):
            os.environ["CODESTRAL_API_KEY"] = get_settings().get("CODESTRAL.KEY")

        # Check for Azure AD configuration
        if get_settings().get("AZURE_AD.CLIENT_ID", None):
            self.azure = True
            # Generate access token using Azure AD credentials from settings
            access_token = _get_azure_ad_token()
            litellm.api_key = access_token
            openai.api_key = access_token

            # Set API base from settings
            self.api_base = get_settings().azure_ad.api_base
            litellm.api_base = self.api_base
            openai.api_base = self.api_base

        # Support for Openrouter models
        if get_settings().get("OPENROUTER.KEY", None):
            openrouter_api_key = get_settings().get("OPENROUTER.KEY", None)
            os.environ["OPENROUTER_API_KEY"] = openrouter_api_key
            litellm.api_key = openrouter_api_key
            openai.api_key = openrouter_api_key

            openrouter_api_base = get_settings().get("OPENROUTER.API_BASE", "https://openrouter.ai/api/v1")
            os.environ["OPENROUTER_API_BASE"] = openrouter_api_base
            self.api_base = openrouter_api_base
            litellm.api_base = openrouter_api_base

        # Models that only use user message
        self.user_message_only_models = USER_MESSAGE_ONLY_MODELS

        # Model that doesn't support temperature argument
        self.no_support_temperature_models = NO_SUPPORT_TEMPERATURE_MODELS

        # Models that support reasoning effort
        self.support_reasoning_models = SUPPORT_REASONING_EFFORT_MODELS

        # Models that support extended thinking
        self.claude_extended_thinking_models = CLAUDE_EXTENDED_THINKING_MODELS

        # Models that require streaming
        self.streaming_required_models = STREAMING_REQUIRED_MODELS

    def prepare_logs(self, response, system, user, resp, finish_reason):
        """
        Build the dictionary logged as the "full response" artifact.

        Args:
            response: response object exposing ``dict()``
            system: system prompt that was sent
            user: user prompt that was sent
            resp: text returned by the model
            finish_reason: finish reason reported for the completion

        Returns:
            dict: shallow copy of ``response.dict()`` extended with the prompts,
            the output, the finish reason and ``main_pr_language`` ('unknown'
            when the handler has no such attribute).
        """
        response_log = response.dict().copy()
        response_log['system'] = system
        response_log['user'] = user
        response_log['output'] = resp
        response_log['finish_reason'] = finish_reason
        response_log['main_pr_language'] = getattr(self, 'main_pr_language', 'unknown')
        return response_log

    def _configure_claude_extended_thinking(self, model: str, kwargs: dict) -> dict:
        """
        Configure Claude extended thinking parameters if applicable.

        Args:
            model (str): The AI model being used
            kwargs (dict): The keyword arguments for the model call

        Returns:
            dict: Updated kwargs with extended thinking configuration

        Raises:
            ValueError: if either configured token count is not a positive
                integer, or the max output tokens are below the thinking budget.
        """
        extended_thinking_budget_tokens = get_settings().config.get("extended_thinking_budget_tokens", 2048)
        extended_thinking_max_output_tokens = get_settings().config.get("extended_thinking_max_output_tokens", 4096)

        # Validate extended thinking parameters
        if not isinstance(extended_thinking_budget_tokens, int) or extended_thinking_budget_tokens <= 0:
            raise ValueError(f"extended_thinking_budget_tokens must be a positive integer, got {extended_thinking_budget_tokens}")
        if not isinstance(extended_thinking_max_output_tokens, int) or extended_thinking_max_output_tokens <= 0:
            raise ValueError(f"extended_thinking_max_output_tokens must be a positive integer, got {extended_thinking_max_output_tokens}")
        if extended_thinking_max_output_tokens < extended_thinking_budget_tokens:
            raise ValueError(f"extended_thinking_max_output_tokens ({extended_thinking_max_output_tokens}) must be greater than or equal to extended_thinking_budget_tokens ({extended_thinking_budget_tokens})")

        kwargs["thinking"] = {
            "type": "enabled",
            "budget_tokens": extended_thinking_budget_tokens
        }
        if get_settings().config.verbosity_level >= 2:
            get_logger().info(f"Adding max output tokens {extended_thinking_max_output_tokens} to model {model}, extended thinking budget tokens: {extended_thinking_budget_tokens}")
        kwargs["max_tokens"] = extended_thinking_max_output_tokens

        # temperature may only be set to 1 when thinking is enabled
        if get_settings().config.verbosity_level >= 2:
            get_logger().info("Temperature may only be set to 1 when thinking is enabled with claude models.")
        kwargs["temperature"] = 1
        return kwargs

    def add_litellm_callbacks(self, kwargs) -> dict:
        """
        Attach tracing metadata for the configured litellm callbacks.

        The current logging context (``command`` and ``pr_url`` from the log
        record's ``extra``) is read by briefly installing a sink on the logger
        and emitting one debug message. Missing context falls back to
        ``"unknown"`` instead of failing.

        Args:
            kwargs: keyword arguments for the completion call; updated in place.

        Returns:
            dict: the same ``kwargs`` with a ``"metadata"`` entry set.
        """
        captured_extra = []

        def capture_logs(message):
            # Parsing the log message and context
            record = message.record
            # ``extra`` may be absent or None; treat both as "no context".
            extra = record.get('extra') or {}
            log_entry = {}
            for key in ("command", "pr_url"):
                if extra.get(key) is not None:
                    log_entry[key] = extra[key]

            # Append the log entry to the captured_logs list
            captured_extra.append(log_entry)

        # Adding the custom sink to Loguru
        handler_id = get_logger().add(capture_logs)
        get_logger().debug("Capturing logs for litellm callbacks")
        get_logger().remove(handler_id)

        # If the sink was never invoked, fall back to an empty context rather
        # than dereferencing None.
        context = captured_extra[0] if captured_extra else {}

        command = context.get("command", "unknown")
        pr_url = context.get("pr_url", "unknown")
        git_provider = get_settings().config.git_provider

        metadata = dict()
        callbacks = litellm.success_callback + litellm.failure_callback + litellm.service_callback
        if "langfuse" in callbacks:
            metadata.update({
                "trace_name": command,
                "tags": [git_provider, command, f'version:{get_version()}'],
                "trace_metadata": {
                    "command": command,
                    "pr_url": pr_url,
                },
            })
        if "langsmith" in callbacks:
            metadata.update({
                "run_name": command,
                "tags": [git_provider, command, f'version:{get_version()}'],
                "extra": {
                    "metadata": {
                        "command": command,
                        "pr_url": pr_url,
                    }
                },
            })

        # Adding the captured logs to the kwargs
        kwargs["metadata"] = metadata

        return kwargs

    @property
    def deployment_id(self):
        """
        Returns the deployment ID for the OpenAI API.
        """
        return get_settings().get("OPENAI.DEPLOYMENT_ID", None)

    def _resolve_reasoning_effort(self):
        """Return the configured reasoning effort, or the MEDIUM default when it is missing or invalid."""
        config_effort = get_settings().config.reasoning_effort
        try:
            ReasoningEffort(config_effort)
            return config_effort
        except (ValueError, TypeError):
            effort = ReasoningEffort.MEDIUM.value
            if config_effort is not None:
                get_logger().warning(
                    f"Invalid reasoning_effort '{config_effort}' in config. "
                    f"Using default '{effort}'. Valid values: {[e.value for e in ReasoningEffort]}"
                )
            return effort

    @retry(
        retry=retry_if_exception_type(openai.APIError) & retry_if_not_exception_type(openai.RateLimitError),
        stop=stop_after_attempt(MODEL_RETRIES),
    )
    async def chat_completion(self, model: str, system: str, user: str, temperature: float = 0.2, img_path: str = None):
        """
        Request a chat completion through litellm.

        Args:
            model (str): model name; prefixed with ``azure/`` when Azure is
                configured and with ``openai/`` for ``gpt-5*`` models
            system (str): system prompt
            user (str): user prompt
            temperature (float): sampling temperature; omitted for models that
                do not support it
            img_path (str): optional image URL sent along with the user prompt

        Returns:
            tuple: ``(response_text, finish_reason)``. When the image URL cannot
            be used, an explanatory message is returned with finish reason
            ``"error"``.

        Raises:
            openai.RateLimitError: re-raised as-is (not retried).
            openai.APIError: re-raised as-is; other exceptions are re-raised
                from this type.
        """
        try:
            resp, finish_reason = None, None
            deployment_id = self.deployment_id
            if self.azure:
                model = 'azure/' + model
            if 'claude' in model and not system:
                system = "No system prompt provided"
                get_logger().warning(
                    "Empty system prompt for claude model. Using a placeholder system prompt to prevent an API error.")
            messages = [{"role": "system", "content": system}, {"role": "user", "content": user}]

            if img_path:
                try:
                    # check if the image link is alive
                    # NOTE(review): blocking call without a timeout inside a coroutine — consider adding one.
                    r = requests.head(img_path, allow_redirects=True)
                    if r.status_code == 404:
                        error_msg = f"The image link is not [alive]({img_path}).\nPlease repost the original image as a comment, and send the question again with 'quote reply' (see [instructions](https://pr-agent-docs.codium.ai/tools/ask/#ask-on-images-using-the-pr-code-as-context))."
                        get_logger().error(error_msg)
                        return error_msg, "error"
                except Exception as e:
                    # Render the exception into the message itself; a bare positional
                    # argument is treated as a format argument and never shown.
                    get_logger().error(f"Error fetching image: {img_path}: {e}")
                    return f"Error fetching image: {img_path}", "error"
                messages[1]["content"] = [{"type": "text", "text": messages[1]["content"]},
                                          {"type": "image_url", "image_url": {"url": img_path}}]

            thinking_kwargs_gpt5 = None
            if model.startswith('gpt-5'):
                # Use configured reasoning_effort or default to MEDIUM
                effort = self._resolve_reasoning_effort()
                thinking_kwargs_gpt5 = {
                    "reasoning_effort": effort,
                    "allowed_openai_params": ["reasoning_effort"],
                }
                get_logger().info(f"Using reasoning_effort='{effort}' for GPT-5 model")
                model = 'openai/' + model.replace('_thinking', '')  # remove _thinking suffix

            # Currently, some models do not support a separate system and user prompts
            if model in self.user_message_only_models or get_settings().config.custom_reasoning_model:
                user = f"{system}\n\n\n{user}"
                system = ""
                get_logger().info(f"Using model {model}, combining system and user prompts")
                messages = [{"role": "user", "content": user}]

            kwargs = {
                "model": model,
                "deployment_id": deployment_id,
                "messages": messages,
                "timeout": get_settings().config.ai_timeout,
                "api_base": self.api_base,
            }

            # Add temperature only if model supports it
            if model not in self.no_support_temperature_models and not get_settings().config.custom_reasoning_model:
                kwargs["temperature"] = temperature

            if thinking_kwargs_gpt5:
                kwargs.update(thinking_kwargs_gpt5)
                # temperature is not sent together with the GPT-5 reasoning parameters
                kwargs.pop('temperature', None)

            # Add reasoning_effort if model supports it
            if model in self.support_reasoning_models:
                reasoning_effort = self._resolve_reasoning_effort()
                get_logger().info(f"Adding reasoning_effort with value {reasoning_effort} to model {model}.")
                kwargs["reasoning_effort"] = reasoning_effort

            # https://docs.anthropic.com/en/docs/build-with-claude/extended-thinking
            if (model in self.claude_extended_thinking_models) and get_settings().config.get("enable_claude_extended_thinking", False):
                kwargs = self._configure_claude_extended_thinking(model, kwargs)

            if get_settings().litellm.get("enable_callbacks", False):
                kwargs = self.add_litellm_callbacks(kwargs)

            seed = get_settings().config.get("seed", -1)
            if temperature > 0 and seed >= 0:
                raise ValueError(f"Seed ({seed}) is not supported with temperature ({temperature}) > 0")
            elif seed >= 0:
                get_logger().info(f"Using fixed seed of {seed}")
                kwargs["seed"] = seed

            if self.repetition_penalty:
                kwargs["repetition_penalty"] = self.repetition_penalty

            # Added support for extra_headers while using litellm to call underlying model, via a api management
            # gateway, would allow for passing custom headers for security and authorization
            if get_settings().get("LITELLM.EXTRA_HEADERS", None):
                try:
                    litellm_extra_headers = json.loads(get_settings().litellm.extra_headers)
                    if not isinstance(litellm_extra_headers, dict):
                        raise ValueError("LITELLM.EXTRA_HEADERS must be a JSON object")
                except json.JSONDecodeError as e:
                    raise ValueError(f"LITELLM.EXTRA_HEADERS contains invalid JSON: {str(e)}") from e
                kwargs["extra_headers"] = litellm_extra_headers

            # Support for custom OpenAI body fields (e.g., Flex Processing)
            kwargs = _process_litellm_extra_body(kwargs)

            # Support for Bedrock custom inference profile via model_id
            model_id = get_settings().get("litellm.model_id")
            if model_id and 'bedrock/' in model:
                kwargs["model_id"] = model_id
                get_logger().info(f"Using Bedrock custom inference profile: {model_id}")

            get_logger().debug("Prompts", artifact={"system": system, "user": user})

            if get_settings().config.verbosity_level >= 2:
                get_logger().info(f"\nSystem prompt:\n{system}")
                get_logger().info(f"\nUser prompt:\n{user}")

            # Get completion with automatic streaming detection
            resp, finish_reason, response_obj = await self._get_completion(**kwargs)

        except openai.RateLimitError as e:
            get_logger().error(f"Rate limit error during LLM inference: {e}")
            raise
        except openai.APIError as e:
            get_logger().warning(f"Error during LLM inference: {e}")
            raise
        except Exception as e:
            get_logger().warning(f"Unknown error during LLM inference: {e}")
            # NOTE(review): raising the class without arguments relies on APIError being
            # constructible with no arguments — confirm against the installed openai version.
            raise openai.APIError from e

        get_logger().debug(f"\nAI response:\n{resp}")

        # log the full response for debugging
        response_log = self.prepare_logs(response_obj, system, user, resp, finish_reason)
        get_logger().debug("Full_response", artifact=response_log)

        # for CLI debugging
        if get_settings().config.verbosity_level >= 2:
            get_logger().info(f"\nAI response:\n{resp}")

        return resp, finish_reason

    async def _get_completion(self, **kwargs):
        """
        Wrapper that automatically handles streaming for required models.

        Returns:
            tuple: ``(content, finish_reason, response_object)``. For streaming
            models the response object is a ``MockResponse`` built from the
            collected content.

        Raises:
            openai.APIError: when a non-streaming response is missing or has no choices.
        """
        model = kwargs["model"]
        if model in self.streaming_required_models:
            kwargs["stream"] = True
            get_logger().info(f"Using streaming mode for model {model}")
            response = await acompletion(**kwargs)
            resp, finish_reason = await _handle_streaming_response(response)
            # Create MockResponse for streaming since we don't have the full response object
            mock_response = MockResponse(resp, finish_reason)
            return resp, finish_reason, mock_response

        response = await acompletion(**kwargs)
        if response is None or len(response["choices"]) == 0:
            raise openai.APIError
        first_choice = response["choices"][0]
        return first_choice['message']['content'], first_choice["finish_reason"], response
class MockResponse:
    """
    Minimal stand-in for a completion response, used for streaming models.

    It exposes the same ``dict()`` accessor and ``choices`` layout as a regular
    response so streamed and non-streamed results can be logged the same way.
    """

    def __init__(self, resp, finish_reason):
        # Single choice carrying the streamed text and its finish reason.
        only_choice = {"message": {"content": resp}, "finish_reason": finish_reason}
        self._data = {"choices": [only_choice]}

    def dict(self):
        """Return the underlying payload dictionary (the stored object, not a copy)."""
        return self._data
Returns: str: The access token """ from azure.identity import ClientSecretCredential try: credential = ClientSecretCredential( tenant_id=get_settings().azure_ad.tenant_id, client_id=get_settings().azure_ad.client_id, client_secret=get_settings().azure_ad.client_secret ) # Get token for Azure OpenAI service token = credential.get_token("https://cognitiveservices.azure.com/.default") return token.token except Exception as e: get_logger().error(f"Failed to get Azure AD token: {e}") raise def _process_litellm_extra_body(kwargs: dict) -> dict: """ Process LITELLM.EXTRA_BODY configuration and update kwargs accordingly. Args: kwargs: The current kwargs dictionary to update Returns: Updated kwargs dictionary Raises: ValueError: If extra_body contains invalid JSON, unsupported keys, or colliding keys """ allowed_extra_body_keys = {"processing_mode", "service_tier"} extra_body = getattr(getattr(get_settings(), "litellm", None), "extra_body", None) if extra_body: try: litellm_extra_body = json.loads(extra_body) if not isinstance(litellm_extra_body, dict): raise ValueError("LITELLM.EXTRA_BODY must be a JSON object") unsupported_keys = set(litellm_extra_body.keys()) - allowed_extra_body_keys if unsupported_keys: raise ValueError(f"LITELLM.EXTRA_BODY contains unsupported keys: {', '.join(unsupported_keys)}. 
class OpenAIHandler(BaseAiHandler):
    """AI handler that sends chat completions through the OpenAI async client."""

    def __init__(self):
        """
        Configure the OpenAI client from the project settings.

        The API key and (optionally) the base URL are exported through environment
        variables; organization, Azure key and API version are set on the ``openai``
        module when the corresponding settings are present.

        Raises:
            ValueError: If a required setting is missing (an AttributeError raised
                while reading the settings is re-raised as ValueError).
        """
        try:
            super().__init__()
            environ["OPENAI_API_KEY"] = get_settings().openai.key
            if get_settings().get("OPENAI.ORG", None):
                openai.organization = get_settings().openai.org
            if get_settings().get("OPENAI.API_TYPE", None):
                if get_settings().openai.api_type == "azure":
                    self.azure = True
                    openai.azure_key = get_settings().openai.key
            if get_settings().get("OPENAI.API_VERSION", None):
                openai.api_version = get_settings().openai.api_version
            if get_settings().get("OPENAI.API_BASE", None):
                environ["OPENAI_BASE_URL"] = get_settings().openai.api_base
        except AttributeError as e:
            raise ValueError("OpenAI key is required") from e

    @property
    def deployment_id(self):
        """
        Returns the deployment ID for the OpenAI API (the OPENAI.DEPLOYMENT_ID setting, or None).
        """
        return get_settings().get("OPENAI.DEPLOYMENT_ID", None)

    @retry(
        retry=retry_if_exception_type(openai.APIError) & retry_if_not_exception_type(openai.RateLimitError),
        stop=stop_after_attempt(OPENAI_RETRIES),
    )
    async def chat_completion(self, model: str, system: str, user: str, temperature: float = 0.2, img_path: str = None):
        """
        Run a single chat completion with a system and a user message.

        Args:
            model: Model name passed to the completion endpoint.
            system: Content of the system message.
            user: Content of the user message.
            temperature: Sampling temperature passed to the endpoint.
            img_path: Not supported by this handler; a warning is logged and the value is ignored.

        Returns:
            tuple: (response_text, finish_reason) taken from the first choice.

        Raises:
            openai.RateLimitError: Logged and re-raised; excluded from retries by the decorator.
            openai.APIError: Logged and re-raised; retried by the decorator up to OPENAI_RETRIES attempts.
        """
        try:
            if img_path:
                get_logger().warning(f"Image path is not supported for OpenAIHandler. Ignoring image path: {img_path}")
            # Interpolate the prompts into the message itself: passing them as an extra
            # positional argument to a message without placeholders never renders them.
            get_logger().info(f"System: {system}")
            get_logger().info(f"User: {user}")
            messages = [{"role": "system", "content": system}, {"role": "user", "content": user}]
            client = AsyncOpenAI()
            chat_completion = await client.chat.completions.create(
                model=model,
                messages=messages,
                temperature=temperature,
            )
            resp = chat_completion.choices[0].message.content
            finish_reason = chat_completion.choices[0].finish_reason
            usage = chat_completion.usage
            get_logger().info("AI response", response=resp, messages=messages, finish_reason=finish_reason,
                              model=model, usage=usage)
            return resp, finish_reason
        except openai.RateLimitError as e:
            get_logger().error(f"Rate limit error during LLM inference: {e}")
            raise
        except openai.APIError as e:
            get_logger().warning(f"Error during LLM inference: {e}")
            raise
        except Exception as e:
            get_logger().warning(f"Unknown error during LLM inference: {e}")
            # NOTE(review): the exception class is raised without arguments; confirm that
            # openai.APIError can be constructed that way in the installed openai version,
            # otherwise this line raises TypeError instead and the retry never triggers.
            raise openai.APIError from e
def filter_ignored(files, platform = 'github'):
    """
    Filter out files that match the ignore patterns.

    Patterns are collected from three settings: ``ignore.regex``, ``ignore.glob``
    (translated to regexes) and, for each framework listed in
    ``config.ignore_language_framework``, the globs stored under ``generated_code``.

    Args:
        files: Provider-specific file entries. Anything that is not a non-empty
            list is returned unchanged.
        platform: Selects how the path is read from each entry:
            'github' -> ``f.filename``; 'bitbucket' -> ``f.new.path`` falling back to
            ``f.old.path``; 'bitbucket_server' -> ``f['path']['toString']``;
            'gitlab' -> ``f['new_path']`` falling back to ``f['old_path']``;
            'azure' -> the entry itself; 'gitea' -> ``f.get("filename", "")``.

    Returns:
        The filtered list. Filtering is best effort: on an unexpected error a
        warning is logged and the list is returned as filtered so far.
    """

    try:
        # load regex patterns, and translate glob patterns to regex.
        # Work on a copy: extending the list object held by the settings in place
        # would grow the shared configuration on every call.
        patterns = get_settings().ignore.regex
        patterns = [patterns] if isinstance(patterns, str) else list(patterns or [])

        glob_setting = get_settings().ignore.glob
        if isinstance(glob_setting, str):  # --ignore.glob=[.*utils.py], --ignore.glob=.*utils.py
            # drop whitespace around comma-separated items and skip empty items
            glob_setting = [g.strip() for g in glob_setting.strip('[]').split(",") if g.strip()]
        patterns += translate_globs_to_regexes(glob_setting or [])

        code_generators = get_settings().config.get('ignore_language_framework', [])
        if isinstance(code_generators, str):
            get_logger().warning("'ignore_language_framework' should be a list. Skipping language framework filtering.")
            code_generators = []
        for cg in code_generators:
            glob_patterns = get_settings().generated_code.get(cg, [])
            if isinstance(glob_patterns, str):
                glob_patterns = [glob_patterns]
            patterns += translate_globs_to_regexes(glob_patterns)

        # compile all valid patterns; report the invalid ones instead of hiding them
        compiled_patterns = []
        for r in patterns:
            try:
                compiled_patterns.append(re.compile(r))
            except re.error as e:
                get_logger().warning(f"Skipping invalid ignore pattern '{r}': {e}")

        # keep filenames that _don't_ match the ignore regex
        if files and isinstance(files, list):
            for r in compiled_patterns:
                if platform == 'github':
                    files = [f for f in files if (f.filename and not r.match(f.filename))]
                elif platform == 'bitbucket':
                    files_o = []
                    for f in files:
                        # prefer the new path; fall back to the old path when the new one
                        # is missing or ignored
                        if hasattr(f, 'new'):
                            if f.new and f.new.path and not r.match(f.new.path):
                                files_o.append(f)
                                continue
                        if hasattr(f, 'old'):
                            if f.old and f.old.path and not r.match(f.old.path):
                                files_o.append(f)
                                continue
                    files = files_o
                elif platform == 'bitbucket_server':
                    files = [f for f in files
                             if f.get('path', {}).get('toString') and not r.match(f['path']['toString'])]
                elif platform == 'gitlab':
                    files_o = []
                    for f in files:
                        if 'new_path' in f and f['new_path'] and not r.match(f['new_path']):
                            files_o.append(f)
                            continue
                        if 'old_path' in f and f['old_path'] and not r.match(f['old_path']):
                            files_o.append(f)
                            continue
                    files = files_o
                elif platform == 'azure':
                    files = [f for f in files if not r.match(f)]
                elif platform == 'gitea':
                    files = [f for f in files if not r.match(f.get("filename", ""))]

    except Exception as e:
        # best effort: never fail the caller because of a filtering problem
        get_logger().warning(f"Could not filter file list: {e}")

    return files
"" return original_file_str def should_skip_patch(filename): patch_extension_skip_types = get_settings().config.patch_extension_skip_types if patch_extension_skip_types and filename: return any(filename.endswith(skip_type) for skip_type in patch_extension_skip_types) return False def process_patch_lines(patch_str, original_file_str, patch_extra_lines_before, patch_extra_lines_after, new_file_str=""): allow_dynamic_context = get_settings().config.allow_dynamic_context patch_extra_lines_before_dynamic = get_settings().config.max_extra_lines_before_dynamic_context file_original_lines = original_file_str.splitlines() file_new_lines = new_file_str.splitlines() if new_file_str else [] len_original_lines = len(file_original_lines) patch_lines = patch_str.splitlines() extended_patch_lines = [] is_valid_hunk = True start1, size1, start2, size2 = -1, -1, -1, -1 RE_HUNK_HEADER = re.compile( r"^@@ -(\d+)(?:,(\d+))? \+(\d+)(?:,(\d+))? @@[ ]?(.*)") try: for i,line in enumerate(patch_lines): if line.startswith('@@'): match = RE_HUNK_HEADER.match(line) # identify hunk header if match: # finish processing previous hunk if is_valid_hunk and (start1 != -1 and patch_extra_lines_after > 0): delta_lines_original = [f' {line}' for line in file_original_lines[start1 + size1 - 1:start1 + size1 - 1 + patch_extra_lines_after]] extended_patch_lines.extend(delta_lines_original) section_header, size1, size2, start1, start2 = extract_hunk_headers(match) is_valid_hunk = check_if_hunk_lines_matches_to_file(i, file_original_lines, patch_lines, start1) if is_valid_hunk and (patch_extra_lines_before > 0 or patch_extra_lines_after > 0): def _calc_context_limits(patch_lines_before): extended_start1 = max(1, start1 - patch_lines_before) extended_size1 = size1 + (start1 - extended_start1) + patch_extra_lines_after extended_start2 = max(1, start2 - patch_lines_before) extended_size2 = size2 + (start2 - extended_start2) + patch_extra_lines_after if extended_start1 - 1 + extended_size1 > len_original_lines: 
# we cannot extend beyond the original file delta_cap = extended_start1 - 1 + extended_size1 - len_original_lines extended_size1 = max(extended_size1 - delta_cap, size1) extended_size2 = max(extended_size2 - delta_cap, size2) return extended_start1, extended_size1, extended_start2, extended_size2 if allow_dynamic_context and file_new_lines: extended_start1, extended_size1, extended_start2, extended_size2 = \ _calc_context_limits(patch_extra_lines_before_dynamic) lines_before_original = file_original_lines[extended_start1 - 1:start1 - 1] lines_before_new = file_new_lines[extended_start2 - 1:start2 - 1] found_header = False for i, line in enumerate(lines_before_original): if section_header in line: # Update start and size in one line each extended_start1, extended_start2 = extended_start1 + i, extended_start2 + i extended_size1, extended_size2 = extended_size1 - i, extended_size2 - i lines_before_original_dynamic_context = lines_before_original[i:] lines_before_new_dynamic_context = lines_before_new[i:] if lines_before_original_dynamic_context == lines_before_new_dynamic_context: # get_logger().debug(f"found dynamic context match for section header: {section_header}") found_header = True section_header = '' else: pass # its ok to be here. 
We cant apply dynamic context if the lines are different if 'old' and 'new' hunks break if not found_header: # get_logger().debug(f"Section header not found in the extra lines before the hunk") extended_start1, extended_size1, extended_start2, extended_size2 = \ _calc_context_limits(patch_extra_lines_before) else: extended_start1, extended_size1, extended_start2, extended_size2 = \ _calc_context_limits(patch_extra_lines_before) # check if extra lines before hunk are different in original and new file delta_lines_original = [f' {line}' for line in file_original_lines[extended_start1 - 1:start1 - 1]] if file_new_lines: delta_lines_new = [f' {line}' for line in file_new_lines[extended_start2 - 1:start2 - 1]] if delta_lines_original != delta_lines_new: found_mini_match = False for i in range(len(delta_lines_original)): if delta_lines_original[i:] == delta_lines_new[i:]: delta_lines_original = delta_lines_original[i:] delta_lines_new = delta_lines_new[i:] extended_start1 += i extended_size1 -= i extended_start2 += i extended_size2 -= i found_mini_match = True break if not found_mini_match: extended_start1 = start1 extended_size1 = size1 extended_start2 = start2 extended_size2 = size2 delta_lines_original = [] # get_logger().debug(f"Extra lines before hunk are different in original and new file", # artifact={"delta_lines_original": delta_lines_original, # "delta_lines_new": delta_lines_new}) # logic to remove section header if its in the extra delta lines (in dynamic context, this is also done) if section_header and not allow_dynamic_context: for line in delta_lines_original: if section_header in line: section_header = '' # remove section header if it is in the extra delta lines break else: extended_start1 = start1 extended_size1 = size1 extended_start2 = start2 extended_size2 = size2 delta_lines_original = [] extended_patch_lines.append('') extended_patch_lines.append( f'@@ -{extended_start1},{extended_size1} ' f'+{extended_start2},{extended_size2} @@ {section_header}') 
extended_patch_lines.extend(delta_lines_original) # one to zero based continue extended_patch_lines.append(line) except Exception as e: get_logger().warning(f"Failed to extend patch: {e}", artifact={"traceback": traceback.format_exc()}) return patch_str # finish processing last hunk if start1 != -1 and patch_extra_lines_after > 0 and is_valid_hunk: delta_lines_original = file_original_lines[start1 + size1 - 1:start1 + size1 - 1 + patch_extra_lines_after] # add space at the beginning of each extra line delta_lines_original = [f' {line}' for line in delta_lines_original] extended_patch_lines.extend(delta_lines_original) extended_patch_str = '\n'.join(extended_patch_lines) return extended_patch_str def check_if_hunk_lines_matches_to_file(i, original_lines, patch_lines, start1): """ Check if the hunk lines match the original file content. We saw cases where the hunk header line doesn't match the original file content, and then extending the hunk with extra lines before the hunk header can cause the hunk to be invalid. """ is_valid_hunk = True try: if i + 1 < len(patch_lines) and patch_lines[i + 1][0] == ' ': # an existing line in the file if patch_lines[i + 1].strip() != original_lines[start1 - 1].strip(): # check if different encoding is needed original_line = original_lines[start1 - 1].strip() for encoding in ['iso-8859-1', 'latin-1', 'ascii', 'utf-16']: try: if original_line.encode(encoding).decode().strip() == patch_lines[i + 1].strip(): get_logger().info(f"Detected different encoding in hunk header line {start1}, needed encoding: {encoding}") return False # we still want to avoid extending the hunk. 
def extract_hunk_headers(match):
    """
    Extract the numeric fields and the section header from a matched hunk header.

    Args:
        match: A match object produced by the hunk-header regex used in this module,
            whose groups are (start1, size1, start2, size2, section_header).

    Returns:
        tuple: (section_header, size1, size2, start1, start2) - note the order.
        A size that is omitted in the header (e.g. '@@ -3 +3 @@' or '@@ -0,0 +1 @@')
        is reported as 1, which is what an omitted count means in the unified diff
        format.
    """
    start1, size1, start2, size2, section_header = match.groups()[:5]

    def _to_int(value, default):
        # optional regex groups are None when absent from the header
        return default if value is None else int(value)

    start1 = _to_int(start1, 0)
    start2 = _to_int(start2, 0)
    size1 = _to_int(size1, 1)
    size2 = _to_int(size2, 1)
    if section_header is None:
        section_header = ''
    return section_header, size1, size2, start1, start2
def _append_numbered_hunk(rendered: str, new_content_lines: list, old_content_lines: list, start2: int) -> str:
    """Append the '__new hunk__' / '__old hunk__' sections of one hunk to ``rendered`` and return it."""
    has_additions = any(ln.startswith('+') for ln in new_content_lines)
    has_deletions = any(ln.startswith('-') for ln in old_content_lines)
    # the new hunk is always shown when the hunk changes anything, even for pure
    # deletions - otherwise the LLM gets confused
    if has_additions or has_deletions:
        rendered = rendered.rstrip() + '\n__new hunk__\n'
        for offset, line_new in enumerate(new_content_lines):
            rendered += f"{start2 + offset} {line_new}\n"
    if has_deletions:
        rendered = rendered.rstrip() + '\n__old hunk__\n'
        for line_old in old_content_lines:
            rendered += f"{line_old}\n"
    return rendered


def decouple_and_convert_to_hunks_with_lines_numbers(patch: str, file) -> str:
    """
    Convert a patch into a per-hunk text that separates the new and the old content.

    For every hunk that contains at least one '+' or '-' line, the hunk header is
    followed by a '__new hunk__' section (context and added lines, each prefixed
    with a running number that starts at the hunk's new-file start line) and, when
    the hunk removes lines, an '__old hunk__' section (context and removed lines,
    without numbers).

    Args:
        patch (str): The patch string to be converted.
        file: Optional object with a ``filename`` attribute (and possibly an
            ``edit_type``). When falsy, no file header is emitted.

    Returns:
        str: The converted text, right-stripped. If ``file.edit_type`` is
        EDIT_TYPE.DELETED, only a "file was deleted" message is returned.

    example output:
    ## File: 'src/file.ts'

    @@ -881,4 +881,5 @@ section
    __new hunk__
    881  line1
    882  line2
    883 +line3
    884  line4
    __old hunk__
     line1
     line2
    -line3_old
     line4
    """
    # Add a header for the file
    if file:
        # if the file was deleted, return a message indicating that the file was deleted
        if hasattr(file, 'edit_type') and file.edit_type == EDIT_TYPE.DELETED:
            return f"\n\n## File '{file.filename.strip()}' was deleted\n"

        patch_with_lines_str = f"\n\n## File: '{file.filename.strip()}'\n"
    else:
        patch_with_lines_str = ""

    patch_lines = patch.splitlines()
    RE_HUNK_HEADER = re.compile(
        r"^@@ -(\d+)(?:,(\d+))? \+(\d+)(?:,(\d+))? @@[ ]?(.*)")
    new_content_lines = []
    old_content_lines = []
    match = None
    start2 = -1
    prev_header_line = ''
    header_line = ''
    for line_i, line in enumerate(patch_lines):
        if 'no newline at end of file' in line.lower():
            continue

        if line.startswith('@@'):
            header_line = line
            match = RE_HUNK_HEADER.match(line)
            if match and (new_content_lines or old_content_lines):
                # found a new hunk: flush the lines collected for the previous one.
                # start2 still holds the previous hunk's start line at this point.
                if prev_header_line:
                    patch_with_lines_str += f'\n{prev_header_line}\n'
                patch_with_lines_str = _append_numbered_hunk(
                    patch_with_lines_str, new_content_lines, old_content_lines, start2)
                new_content_lines = []
                old_content_lines = []
            if match:
                prev_header_line = header_line
                # only parse headers that matched; an unparsable '@@' line has no fields
                _, _, _, _, start2 = extract_hunk_headers(match)

        elif line.startswith('+'):
            new_content_lines.append(line)
        elif line.startswith('-'):
            old_content_lines.append(line)
        else:
            if not line and line_i:
                # skip an empty line that directly precedes a hunk header or ends the patch
                if line_i + 1 < len(patch_lines) and patch_lines[line_i + 1].startswith('@@'):
                    continue
                elif line_i + 1 == len(patch_lines):
                    continue
            # context line: belongs to both the new and the old view
            new_content_lines.append(line)
            old_content_lines.append(line)

    # finishing last hunk
    if match and new_content_lines:
        patch_with_lines_str += f'\n{header_line}\n'
        patch_with_lines_str = _append_numbered_hunk(
            patch_with_lines_str, new_content_lines, old_content_lines, start2)

    return patch_with_lines_str.rstrip()
def _get_configured_bad_extensions() -> list:
    """Return a fresh list with the default bad extensions plus, when enabled, the extra ones."""
    # Copy first: extending the list object stored in the settings in place would
    # append the extra extensions to the configured default again on every call.
    bad_extensions = list(get_settings().bad_extensions.default)
    if get_settings().config.use_extra_bad_extensions:
        bad_extensions.extend(get_settings().bad_extensions.extra)
    return bad_extensions


def filter_bad_extensions(files):
    """
    Keep only the files whose name is set and passes ``is_valid_file``.

    Args:
        files: Iterable of objects exposing a ``filename`` attribute.

    Returns:
        list: The entries with a valid filename, in their original order.
    """
    # Bad Extensions, source: https://github.com/EleutherAI/github-downloader/blob/345e7c4cbb9e0dc8a0615fd995a08bf9d73b3fe6/download_repo_text.py  # noqa: E501
    bad_extensions = _get_configured_bad_extensions()
    return [f for f in files if f.filename is not None and is_valid_file(f.filename, bad_extensions)]


def is_valid_file(filename:str, bad_extensions=None) -> bool:
    """
    Tell whether a file should be processed, based on its name.

    Args:
        filename: File name or path; a falsy value is never valid.
        bad_extensions: Extensions (without the leading dot) to reject. When empty
            or None, the configured bad extensions are used. The argument is never
            modified.

    Returns:
        bool: False for well-known lock files and for files whose last
        dot-separated component is in ``bad_extensions``; True otherwise.
    """
    if not filename:
        return False
    if not bad_extensions:
        bad_extensions = _get_configured_bad_extensions()

    auto_generated_files = ('package-lock.json', 'yarn.lock', 'composer.lock', 'Gemfile.lock', 'poetry.lock')
    if filename.endswith(auto_generated_files):
        return False

    return filename.split('.')[-1] not in bad_extensions
v in sorted(languages.items(), key=lambda item: item[1], reverse=True)] # languages_sorted = sorted(languages, key=lambda x: x[1], reverse=True) # get all extensions for the languages main_extensions = [] language_extension_map_org = get_settings().language_extension_map_org language_extension_map = {k.lower(): v for k, v in language_extension_map_org.items()} for language in languages_sorted_list: if language.lower() in language_extension_map: main_extensions.append(language_extension_map[language.lower()]) else: main_extensions.append([]) # filter out files bad extensions files_filtered = filter_bad_extensions(files) # sort files by their extension, put the files that are in the main extension first # and the rest files after, map languages_sorted to their respective files files_sorted = [] rest_files = {} # if no languages detected, put all files in the "Other" category if not languages: files_sorted = [({"language": "Other", "files": list(files_filtered)})] return files_sorted main_extensions_flat = [] for ext in main_extensions: main_extensions_flat.extend(ext) for extensions, lang in zip(main_extensions, languages_sorted_list): # noqa: B905 tmp = [] for file in files_filtered: extension_str = f".{file.filename.split('.')[-1]}" if extension_str in extensions: tmp.append(file) else: if (file.filename not in rest_files) and (extension_str not in main_extensions_flat): rest_files[file.filename] = file if len(tmp) > 0: files_sorted.append({"language": lang, "files": tmp}) files_sorted.append({"language": "Other", "files": list(rest_files.values())}) return files_sorted ================================================ FILE: pr_agent/algo/pr_processing.py ================================================ from __future__ import annotations import traceback from typing import Callable, List, Tuple from github import RateLimitExceededException from pr_agent.algo.file_filter import filter_ignored from pr_agent.algo.git_patch_processing import ( extend_patch, 
def cap_and_log_extra_lines(value, direction) -> int:
    """
    Limit a configured number of extra patch lines to MAX_EXTRA_LINES.

    Args:
        value: The configured number of extra lines.
        direction: Suffix of the setting name used in the warning message
            (the message reads ``patch_extra_lines_<direction>``).

    Returns:
        ``value`` when it does not exceed MAX_EXTRA_LINES; otherwise MAX_EXTRA_LINES,
        after logging a warning.
    """
    within_limit = not value > MAX_EXTRA_LINES
    if within_limit:
        return value

    get_logger().warning(f"patch_extra_lines_{direction} was {value}, capping to {MAX_EXTRA_LINES}")
    return MAX_EXTRA_LINES
original message {e}") raise # get pr languages pr_languages = sort_files_by_main_languages(git_provider.get_languages(), diff_files) if pr_languages: try: get_logger().info(f"PR main language: {pr_languages[0]['language']}") except Exception as e: pass # generate a standard diff string, with patch extension patches_extended, total_tokens, patches_extended_tokens = pr_generate_extended_diff( pr_languages, token_handler, add_line_numbers_to_hunks, patch_extra_lines_before=PATCH_EXTRA_LINES_BEFORE, patch_extra_lines_after=PATCH_EXTRA_LINES_AFTER) # if we are under the limit, return the full diff if total_tokens + OUTPUT_BUFFER_TOKENS_SOFT_THRESHOLD < get_max_tokens(model): get_logger().info(f"Tokens: {total_tokens}, total tokens under limit: {get_max_tokens(model)}, " f"returning full diff.") return "\n".join(patches_extended) # if we are over the limit, start pruning (If we got here, we will not extend the patches with extra lines) get_logger().info(f"Tokens: {total_tokens}, total tokens over limit: {get_max_tokens(model)}, " f"pruning diff.") patches_compressed_list, total_tokens_list, deleted_files_list, remaining_files_list, file_dict, files_in_patches_list = \ pr_generate_compressed_diff(pr_languages, token_handler, model, add_line_numbers_to_hunks, large_pr_handling) if large_pr_handling and len(patches_compressed_list) > 1: get_logger().info(f"Large PR handling mode, and found {len(patches_compressed_list)} patches with original diff.") return "" # return empty string, as we want to generate multiple patches with a different prompt # return the first patch patches_compressed = patches_compressed_list[0] total_tokens_new = total_tokens_list[0] files_in_patch = files_in_patches_list[0] # Insert additional information about added, modified, and deleted files if there is enough space max_tokens = get_max_tokens(model) - OUTPUT_BUFFER_TOKENS_HARD_THRESHOLD curr_token = total_tokens_new # == token_handler.count_tokens(final_diff)+token_handler.prompt_tokens 
def get_pr_diff_multiple_patchs(git_provider: GitProvider, token_handler: TokenHandler, model: str,
                                add_line_numbers_to_hunks: bool = False, disable_extra_lines: bool = False):
    """
    Build the compressed diff of a PR in large-PR mode and return all of its parts.

    The diff files are fetched from the git provider, grouped by main language and
    handed to ``pr_generate_compressed_diff`` with ``large_pr_handling=True``.

    Args:
        git_provider: Provider used to fetch the diff files and the repo languages.
        token_handler: Passed through to ``pr_generate_compressed_diff``.
        model: Passed through to ``pr_generate_compressed_diff``.
        add_line_numbers_to_hunks: Passed through to ``pr_generate_compressed_diff``.
        disable_extra_lines: Accepted but not used by this function.

    Returns:
        tuple: (patches_compressed_list, total_tokens_list, deleted_files_list,
        remaining_files_list, file_dict, files_in_patches_list)

    Raises:
        RateLimitExceededException: Logged and re-raised when fetching the diff files
            hits the provider rate limit.
    """
    try:
        diff_files = git_provider.get_diff_files()
    except RateLimitExceededException as e:
        get_logger().error(f"Rate limit exceeded for git provider API. original message {e}")
        raise

    # get pr languages
    repo_languages = git_provider.get_languages()
    pr_languages = sort_files_by_main_languages(repo_languages, diff_files)
    if pr_languages:
        # logging the main language is informational only and must never fail the flow
        try:
            get_logger().info(f"PR main language: {pr_languages[0]['language']}")
        except Exception:
            pass

    compressed = pr_generate_compressed_diff(pr_languages, token_handler, model, add_line_numbers_to_hunks,
                                             large_pr_handling=True)
    # unpack explicitly so the six-part shape of the result stays enforced
    (patches_compressed_list, total_tokens_list, deleted_files_list,
     remaining_files_list, file_dict, files_in_patches_list) = compressed

    return (patches_compressed_list, total_tokens_list, deleted_files_list,
            remaining_files_list, file_dict, files_in_patches_list)
= f"\n\n## File: '{file.filename.strip()}'\n\n{extended_patch.strip()}\n" # add AI-summary metadata to the patch if file.ai_file_summary and get_settings().get("config.enable_ai_metadata", False): full_extended_patch = add_ai_summary_top_patch(file, full_extended_patch) patch_tokens = token_handler.count_tokens(full_extended_patch) file.tokens = patch_tokens total_tokens += patch_tokens patches_extended_tokens.append(patch_tokens) patches_extended.append(full_extended_patch) return patches_extended, total_tokens, patches_extended_tokens def pr_generate_compressed_diff(top_langs: list, token_handler: TokenHandler, model: str, convert_hunks_to_line_numbers: bool, large_pr_handling: bool) -> Tuple[list, list, list, list, dict, list]: deleted_files_list = [] # sort each one of the languages in top_langs by the number of tokens in the diff sorted_files = [] for lang in top_langs: sorted_files.extend(sorted(lang['files'], key=lambda x: x.tokens, reverse=True)) # generate patches for each file, and count tokens file_dict = {} for file in sorted_files: original_file_content_str = file.base_file new_file_content_str = file.head_file patch = file.patch if not patch: continue # removing delete-only hunks patch = handle_patch_deletions(patch, original_file_content_str, new_file_content_str, file.filename, file.edit_type) if patch is None: if file.filename not in deleted_files_list: deleted_files_list.append(file.filename) continue if convert_hunks_to_line_numbers: patch = decouple_and_convert_to_hunks_with_lines_numbers(patch, file) ## add AI-summary metadata to the patch (disabled, since we are in the compressed diff) # if file.ai_file_summary and get_settings().config.get('config.is_auto_command', False): # patch = add_ai_summary_top_patch(file, patch) new_patch_tokens = token_handler.count_tokens(patch) file_dict[file.filename] = {'patch': patch, 'tokens': new_patch_tokens, 'edit_type': file.edit_type} max_tokens_model = get_max_tokens(model) # first iteration 
files_in_patches_list = [] remaining_files_list = [file.filename for file in sorted_files] patches_list =[] total_tokens_list = [] total_tokens, patches, remaining_files_list, files_in_patch_list = generate_full_patch(convert_hunks_to_line_numbers, file_dict, max_tokens_model, remaining_files_list, token_handler) patches_list.append(patches) total_tokens_list.append(total_tokens) files_in_patches_list.append(files_in_patch_list) # additional iterations (if needed) if large_pr_handling: NUMBER_OF_ALLOWED_ITERATIONS = get_settings().pr_description.max_ai_calls - 1 # one more call is to summarize for i in range(NUMBER_OF_ALLOWED_ITERATIONS-1): if remaining_files_list: total_tokens, patches, remaining_files_list, files_in_patch_list = generate_full_patch(convert_hunks_to_line_numbers, file_dict, max_tokens_model, remaining_files_list, token_handler) if patches: patches_list.append(patches) total_tokens_list.append(total_tokens) files_in_patches_list.append(files_in_patch_list) else: break return patches_list, total_tokens_list, deleted_files_list, remaining_files_list, file_dict, files_in_patches_list def generate_full_patch(convert_hunks_to_line_numbers, file_dict, max_tokens_model,remaining_files_list_prev, token_handler): total_tokens = token_handler.prompt_tokens # initial tokens patches = [] remaining_files_list_new = [] files_in_patch_list = [] for filename, data in file_dict.items(): if filename not in remaining_files_list_prev: continue patch = data['patch'] new_patch_tokens = data['tokens'] edit_type = data['edit_type'] # Hard Stop, no more tokens if total_tokens > max_tokens_model - OUTPUT_BUFFER_TOKENS_HARD_THRESHOLD: get_logger().warning(f"File was fully skipped, no more tokens: {filename}.") continue # If the patch is too large, just show the file name if total_tokens + new_patch_tokens > max_tokens_model - OUTPUT_BUFFER_TOKENS_SOFT_THRESHOLD: # Current logic is to skip the patch if it's too large # TODO: Option for alternative logic to remove hunks from 
the patch to reduce the number of tokens # until we meet the requirements if get_settings().config.verbosity_level >= 2: get_logger().warning(f"Patch too large, skipping it: '{filename}'") remaining_files_list_new.append(filename) continue if patch: if not convert_hunks_to_line_numbers: patch_final = f"\n\n## File: '{filename.strip()}'\n\n{patch.strip()}\n" else: patch_final = "\n\n" + patch.strip() patches.append(patch_final) total_tokens += token_handler.count_tokens(patch_final) files_in_patch_list.append(filename) if get_settings().config.verbosity_level >= 2: get_logger().info(f"Tokens: {total_tokens}, last filename: {filename}") return total_tokens, patches, remaining_files_list_new, files_in_patch_list async def retry_with_fallback_models(f: Callable, model_type: ModelType = ModelType.REGULAR): all_models = _get_all_models(model_type) all_deployments = _get_all_deployments(all_models) # try each (model, deployment_id) pair until one is successful, otherwise raise exception for i, (model, deployment_id) in enumerate(zip(all_models, all_deployments)): try: get_logger().debug( f"Generating prediction with {model}" f"{(' from deployment ' + deployment_id) if deployment_id else ''}" ) get_settings().set("openai.deployment_id", deployment_id) return await f(model) except Exception as e: get_logger().warning( f"Failed to generate prediction with {model}", artifact={"error": e}, ) if i == len(all_models) - 1: # If it's the last iteration raise Exception(f"Failed to generate prediction with any model of {all_models}") from e def _get_all_models(model_type: ModelType = ModelType.REGULAR) -> List[str]: if model_type == ModelType.WEAK: model = get_model('model_weak') elif model_type == ModelType.REASONING: model = get_model('model_reasoning') elif model_type == ModelType.REGULAR: model = get_settings().config.model else: model = get_settings().config.model fallback_models = get_settings().config.fallback_models if not isinstance(fallback_models, list): fallback_models = 
[m.strip() for m in fallback_models.split(",")] all_models = [model] + fallback_models return all_models def _get_all_deployments(all_models: List[str]) -> List[str]: deployment_id = get_settings().get("openai.deployment_id", None) fallback_deployments = get_settings().get("openai.fallback_deployments", []) if not isinstance(fallback_deployments, list) and fallback_deployments: fallback_deployments = [d.strip() for d in fallback_deployments.split(",")] if fallback_deployments: all_deployments = [deployment_id] + fallback_deployments if len(all_deployments) < len(all_models): raise ValueError(f"The number of deployments ({len(all_deployments)}) " f"is less than the number of models ({len(all_models)})") else: all_deployments = [deployment_id] * len(all_models) return all_deployments def get_pr_multi_diffs(git_provider: GitProvider, token_handler: TokenHandler, model: str, max_calls: int = 5, add_line_numbers: bool = True) -> List[str]: """ Retrieves the diff files from a Git provider, sorts them by main language, and generates patches for each file. The patches are split into multiple groups based on the maximum number of tokens allowed for the given model. Args: git_provider (GitProvider): An object that provides access to Git provider APIs. token_handler (TokenHandler): An object that handles tokens in the context of a pull request. model (str): The name of the model. max_calls (int, optional): The maximum number of calls to retrieve diff files. Defaults to 5. Returns: List[str]: A list of final diff strings, split into multiple groups based on the maximum number of tokens allowed for the given model. Raises: RateLimitExceededException: If the rate limit for the Git provider API is exceeded. """ try: diff_files = git_provider.get_diff_files() except RateLimitExceededException as e: get_logger().error(f"Rate limit exceeded for git provider API. 
original message {e}") raise # Sort files by main language pr_languages = sort_files_by_main_languages(git_provider.get_languages(), diff_files) # Get the maximum number of extra lines before and after the patch PATCH_EXTRA_LINES_BEFORE = get_settings().config.patch_extra_lines_before PATCH_EXTRA_LINES_AFTER = get_settings().config.patch_extra_lines_after PATCH_EXTRA_LINES_BEFORE = cap_and_log_extra_lines(PATCH_EXTRA_LINES_BEFORE, "before") PATCH_EXTRA_LINES_AFTER = cap_and_log_extra_lines(PATCH_EXTRA_LINES_AFTER, "after") # try first a single run with standard diff string, with patch extension, and no deletions patches_extended, total_tokens, patches_extended_tokens = pr_generate_extended_diff( pr_languages, token_handler, add_line_numbers_to_hunks=add_line_numbers, patch_extra_lines_before=PATCH_EXTRA_LINES_BEFORE, patch_extra_lines_after=PATCH_EXTRA_LINES_AFTER) # if we are under the limit, return the full diff if total_tokens + OUTPUT_BUFFER_TOKENS_SOFT_THRESHOLD < get_max_tokens(model): return ["\n".join(patches_extended)] if patches_extended else [] # Sort files within each language group by tokens in descending order sorted_files = [] for lang in pr_languages: sorted_files.extend(sorted(lang['files'], key=lambda x: x.tokens, reverse=True)) patches = [] final_diff_list = [] total_tokens = token_handler.prompt_tokens call_number = 1 for file in sorted_files: if call_number > max_calls: if get_settings().config.verbosity_level >= 2: get_logger().info(f"Reached max calls ({max_calls})") break original_file_content_str = file.base_file new_file_content_str = file.head_file patch = file.patch if not patch: continue # Remove delete-only hunks patch = handle_patch_deletions(patch, original_file_content_str, new_file_content_str, file.filename, file.edit_type) if patch is None: continue # Add line numbers and metadata to the patch if add_line_numbers: patch = decouple_and_convert_to_hunks_with_lines_numbers(patch, file) else: patch = f"\n\n## File: 
'{file.filename.strip()}'\n\n{patch.strip()}\n" # add AI-summary metadata to the patch if file.ai_file_summary and get_settings().get("config.enable_ai_metadata", False): patch = add_ai_summary_top_patch(file, patch) new_patch_tokens = token_handler.count_tokens(patch) if patch and (token_handler.prompt_tokens + new_patch_tokens) > get_max_tokens( model) - OUTPUT_BUFFER_TOKENS_SOFT_THRESHOLD: if get_settings().config.get('large_patch_policy', 'skip') == 'skip': get_logger().warning(f"Patch too large, skipping: {file.filename}") continue elif get_settings().config.get('large_patch_policy') == 'clip': delta_tokens = get_max_tokens(model) - OUTPUT_BUFFER_TOKENS_SOFT_THRESHOLD - token_handler.prompt_tokens patch_clipped = clip_tokens(patch, delta_tokens, delete_last_line=True, num_input_tokens=new_patch_tokens) new_patch_tokens = token_handler.count_tokens(patch_clipped) if patch_clipped and (token_handler.prompt_tokens + new_patch_tokens) > get_max_tokens( model) - OUTPUT_BUFFER_TOKENS_SOFT_THRESHOLD: get_logger().warning(f"Patch too large, skipping: {file.filename}") continue else: get_logger().info(f"Clipped large patch for file: {file.filename}") patch = patch_clipped else: get_logger().warning(f"Patch too large, skipping: {file.filename}") continue if patch and (total_tokens + new_patch_tokens > get_max_tokens(model) - OUTPUT_BUFFER_TOKENS_SOFT_THRESHOLD): final_diff = "\n".join(patches) final_diff_list.append(final_diff) patches = [] total_tokens = token_handler.prompt_tokens call_number += 1 if call_number > max_calls: # avoid creating new patches if get_settings().config.verbosity_level >= 2: get_logger().info(f"Reached max calls ({max_calls})") break if get_settings().config.verbosity_level >= 2: get_logger().info(f"Call number: {call_number}") if patch: patches.append(patch) total_tokens += new_patch_tokens if get_settings().config.verbosity_level >= 2: get_logger().info(f"Tokens: {total_tokens}, last filename: {file.filename}") # Add the last chunk if 
patches: final_diff = "\n".join(patches) final_diff_list.append(final_diff.strip()) return final_diff_list def add_ai_metadata_to_diff_files(git_provider, pr_description_files): """ Adds AI metadata to the diff files based on the PR description files (FilePatchInfo.ai_file_summary). """ try: if not pr_description_files: get_logger().warning(f"PR description files are empty.") return available_files = {pr_file['full_file_name'].strip(): pr_file for pr_file in pr_description_files} diff_files = git_provider.get_diff_files() found_any_match = False for file in diff_files: filename = file.filename.strip() if filename in available_files: file.ai_file_summary = available_files[filename] found_any_match = True if not found_any_match: get_logger().error(f"Failed to find any matching files between PR description and diff files.", artifact={"pr_description_files": pr_description_files}) except Exception as e: get_logger().error(f"Failed to add AI metadata to diff files: {e}", artifact={"traceback": traceback.format_exc()}) def add_ai_summary_top_patch(file, full_extended_patch): try: # below every instance of '## File: ...' in the patch, add the ai-summary metadata full_extended_patch_lines = full_extended_patch.split("\n") for i, line in enumerate(full_extended_patch_lines): if line.startswith("## File:") or line.startswith("## file:"): full_extended_patch_lines.insert(i + 1, f"### AI-generated changes summary:\n{file.ai_file_summary['long_summary']}") full_extended_patch = "\n".join(full_extended_patch_lines) return full_extended_patch # if no '## File: ...' 
was found return full_extended_patch except Exception as e: get_logger().error(f"Failed to add AI summary to the top of the patch: {e}", artifact={"traceback": traceback.format_exc()}) return full_extended_patch ================================================ FILE: pr_agent/algo/token_handler.py ================================================ from threading import Lock from math import ceil import re from jinja2 import Environment, StrictUndefined from tiktoken import encoding_for_model, get_encoding from pr_agent.config_loader import get_settings from pr_agent.log import get_logger class ModelTypeValidator: @staticmethod def is_openai_model(model_name: str) -> bool: return 'gpt' in model_name or re.match(r"^o[1-9](-mini|-preview)?$", model_name) @staticmethod def is_anthropic_model(model_name: str) -> bool: return 'claude' in model_name class TokenEncoder: _encoder_instance = None _model = None _lock = Lock() # Create a lock object @classmethod def get_token_encoder(cls): model = get_settings().config.model if cls._encoder_instance is None or model != cls._model: # Check without acquiring the lock for performance with cls._lock: # Lock acquisition to ensure thread safety if cls._encoder_instance is None or model != cls._model: cls._model = model try: cls._encoder_instance = encoding_for_model(cls._model) if "gpt" in cls._model else get_encoding( "o200k_base") except: cls._encoder_instance = get_encoding("o200k_base") return cls._encoder_instance class TokenHandler: """ A class for handling tokens in the context of a pull request. Attributes: - encoder: An object of the encoding_for_model class from the tiktoken module. Used to encode strings and count the number of tokens in them. - limit: The maximum number of tokens allowed for the given model, as defined in the MAX_TOKENS dictionary in the pr_agent.algo module. - prompt_tokens: The number of tokens in the system and user strings, as calculated by the _get_system_user_tokens method. 
""" # Constants CLAUDE_MODEL = "claude-3-7-sonnet-20250219" CLAUDE_MAX_CONTENT_SIZE = 9_000_000 # Maximum allowed content size (9MB) for Claude API def __init__(self, pr=None, vars: dict = {}, system="", user=""): """ Initializes the TokenHandler object. Args: - pr: The pull request object. - vars: A dictionary of variables. - system: The system string. - user: The user string. """ self.encoder = TokenEncoder.get_token_encoder() if pr is not None: self.prompt_tokens = self._get_system_user_tokens(pr, self.encoder, vars, system, user) def _get_system_user_tokens(self, pr, encoder, vars: dict, system, user): """ Calculates the number of tokens in the system and user strings. Args: - pr: The pull request object. - encoder: An object of the encoding_for_model class from the tiktoken module. - vars: A dictionary of variables. - system: The system string. - user: The user string. Returns: The sum of the number of tokens in the system and user strings. """ try: environment = Environment(undefined=StrictUndefined) system_prompt = environment.from_string(system).render(vars) user_prompt = environment.from_string(user).render(vars) system_prompt_tokens = len(encoder.encode(system_prompt)) user_prompt_tokens = len(encoder.encode(user_prompt)) return system_prompt_tokens + user_prompt_tokens except Exception as e: get_logger().error(f"Error in _get_system_user_tokens: {e}") return 0 def _calc_claude_tokens(self, patch: str) -> int: try: import anthropic from pr_agent.algo import MAX_TOKENS client = anthropic.Anthropic(api_key=get_settings(use_context=False).get('anthropic.key')) max_tokens = MAX_TOKENS[get_settings().config.model] if len(patch.encode('utf-8')) > self.CLAUDE_MAX_CONTENT_SIZE: get_logger().warning( "Content too large for Anthropic token counting API, falling back to local tokenizer" ) return max_tokens response = client.messages.count_tokens( model=self.CLAUDE_MODEL, system="system", messages=[{ "role": "user", "content": patch }], ) return response.input_tokens 
except Exception as e: get_logger().error(f"Error in Anthropic token counting: {e}") return max_tokens def _apply_estimation_factor(self, model_name: str, default_estimate: int) -> int: factor = 1 + get_settings().get('config.model_token_count_estimate_factor', 0) get_logger().warning(f"{model_name}'s token count cannot be accurately estimated. Using factor of {factor}") return ceil(factor * default_estimate) def _get_token_count_by_model_type(self, patch: str, default_estimate: int) -> int: """ Get token count based on model type. Args: patch: The text to count tokens for. default_estimate: The default token count estimate. Returns: int: The calculated token count. """ model_name = get_settings().config.model.lower() if ModelTypeValidator.is_openai_model(model_name) and get_settings(use_context=False).get('openai.key'): return default_estimate if ModelTypeValidator.is_anthropic_model(model_name) and get_settings(use_context=False).get('anthropic.key'): return self._calc_claude_tokens(patch) return self._apply_estimation_factor(model_name, default_estimate) def count_tokens(self, patch: str, force_accurate: bool = False) -> int: """ Counts the number of tokens in a given patch string. Args: - patch: The patch string. - force_accurate: If True, uses a more precise calculation method. Returns: The number of tokens in the patch string. """ encoder_estimate = len(self.encoder.encode(patch, disallowed_special=())) # If an estimate is enough (for example, in cases where the maximal allowed tokens is way below the known limits), return it. 
if not force_accurate: return encoder_estimate return self._get_token_count_by_model_type(patch, encoder_estimate) ================================================ FILE: pr_agent/algo/types.py ================================================ from dataclasses import dataclass from enum import Enum from typing import Optional class EDIT_TYPE(Enum): ADDED = 1 DELETED = 2 MODIFIED = 3 RENAMED = 4 UNKNOWN = 5 @dataclass class FilePatchInfo: base_file: str head_file: str patch: str filename: str tokens: int = -1 edit_type: EDIT_TYPE = EDIT_TYPE.UNKNOWN old_filename: str = None num_plus_lines: int = -1 num_minus_lines: int = -1 language: Optional[str] = None ai_file_summary: str = None ================================================ FILE: pr_agent/algo/utils.py ================================================ from __future__ import annotations import ast import copy import difflib import hashlib import html import json import os import re import sys import textwrap import time import traceback from datetime import datetime from enum import Enum from importlib.metadata import PackageNotFoundError, version from typing import Any, List, Tuple, TypedDict import html2text import requests import yaml from pydantic import BaseModel from starlette_context import context from pr_agent.algo import MAX_TOKENS from pr_agent.algo.git_patch_processing import extract_hunk_lines_from_patch from pr_agent.algo.token_handler import TokenEncoder from pr_agent.algo.types import FilePatchInfo from pr_agent.config_loader import get_settings, global_settings from pr_agent.log import get_logger def get_model(model_type: str = "model_weak") -> str: if model_type == "model_weak" and get_settings().get("config.model_weak"): return get_settings().config.model_weak elif model_type == "model_reasoning" and get_settings().get("config.model_reasoning"): return get_settings().config.model_reasoning return get_settings().config.model class Range(BaseModel): line_start: int # should be 0-indexed line_end: 
int column_start: int = -1 column_end: int = -1


class ModelType(str, Enum):
    # String-valued selector for which configured model family to use.
    REGULAR = "regular"
    WEAK = "weak"
    REASONING = "reasoning"


class TodoItem(TypedDict):
    # Dictionary shape of one TODO entry: the file, a (start, end) line pair and the text.
    relevant_file: str
    line_range: Tuple[int, int]
    content: str


class PRReviewHeader(str, Enum):
    # Markdown headings placed at the top of a review comment.
    REGULAR = "## PR Reviewer Guide"
    INCREMENTAL = "## Incremental PR Reviewer Guide"


class ReasoningEffort(str, Enum):
    # String-valued effort levels; how they are consumed is not visible in this part of the file.
    XHIGH = "xhigh"
    HIGH = "high"
    MEDIUM = "medium"
    LOW = "low"
    MINIMAL = "minimal"
    NONE = "none"


class PRDescriptionHeader(str, Enum):
    # Section titles (plain text, no markdown prefix).
    DIAGRAM_WALKTHROUGH = "Diagram Walkthrough"
    FILE_WALKTHROUGH = "File Walkthrough"


def get_setting(key: str) -> Any:
    """
    Return the value of `key` (upper-cased) from the request-context settings.

    The context's "settings" object is consulted first, with `global_settings` as the fallback for
    both a missing context entry and a missing key. If reading the context raises, the value comes
    from `global_settings` alone (None when absent).
    """
    try:
        key = key.upper()
        return context.get("settings", global_settings).get(key, global_settings.get(key, None))
    except Exception:
        # reading the context failed; the global settings are the only source left
        return global_settings.get(key, None)


def emphasize_header(text: str, only_markdown=False, reference_link=None) -> str: try: # Finding the position of the first occurrence of ": " colon_position = text.find(": ") # Splitting the string and wrapping the first part in tags if colon_position != -1: # Everything before the colon (inclusive) is wrapped in tags if only_markdown: if reference_link: transformed_string = f"[**{text[:colon_position + 1]}**]({reference_link})\n" + text[colon_position + 1:] else: transformed_string = f"**{text[:colon_position + 1]}**\n" + text[colon_position + 1:] else: if reference_link: transformed_string = f"{text[:colon_position + 1]}
" + text[colon_position + 1:] else: transformed_string = "" + text[:colon_position + 1] + "" +'
' + text[colon_position + 1:] else: # If there's no ": ", return the original string transformed_string = text return transformed_string except Exception as e: get_logger().exception(f"Failed to emphasize header: {e}") return text def unique_strings(input_list: List[str]) -> List[str]: if not input_list or not isinstance(input_list, list): return input_list seen = set() unique_list = [] for item in input_list: if item not in seen: unique_list.append(item) seen.add(item) return unique_list def convert_to_markdown_v2(output_data: dict, gfm_supported: bool = True, incremental_review=None, git_provider=None, files=None) -> str: """ Convert a dictionary of data into markdown format. Args: output_data (dict): A dictionary containing data to be converted to markdown format. Returns: str: The markdown formatted text generated from the input dictionary. """ emojis = { "Can be split": "🔀", "Key issues to review": "⚡", "Recommended focus areas for review": "⚡", "Score": "🏅", "Relevant tests": "🧪", "Focused PR": "✨", "Relevant ticket": "🎫", "Security concerns": "🔒", "Todo sections": "📝", "Insights from user's answers": "📝", "Code feedback": "🤖", "Estimated effort to review [1-5]": "⏱️", "Contribution time cost estimate": "⏳", "Ticket compliance check": "🎫", } markdown_text = "" if not incremental_review: markdown_text += f"{PRReviewHeader.REGULAR.value} 🔍\n\n" else: markdown_text += f"{PRReviewHeader.INCREMENTAL.value} 🔍\n\n" markdown_text += f"⏮️ Review for commits since previous PR-Agent review {incremental_review}.\n\n" if not output_data or not output_data.get('review', {}): return "" if get_settings().get("pr_reviewer.enable_intro_text", False): markdown_text += f"Here are some key observations to aid the review process:\n\n" if gfm_supported: markdown_text += "\n" todo_summary = output_data['review'].pop('todo_summary', '') for key, value in output_data['review'].items(): if value is None or value == '' or value == {} or value == []: if key.lower() not in 
['can_be_split', 'key_issues_to_review']: continue key_nice = key.replace('_', ' ').capitalize() emoji = emojis.get(key_nice, "") if 'Estimated effort to review' in key_nice: key_nice = 'Estimated effort to review' value = str(value).strip() if value.isnumeric(): value_int = int(value) else: try: value_int = int(value.split(',')[0]) except ValueError: continue blue_bars = '🔵' * value_int white_bars = '⚪' * (5 - value_int) value = f"{value_int} {blue_bars}{white_bars}" if gfm_supported: markdown_text += f"\n" else: markdown_text += f"### {emoji} {key_nice}: {value}\n\n" elif 'relevant tests' in key_nice.lower(): value = str(value).strip().lower() if gfm_supported: markdown_text += f"\n" else: if is_value_no(value): markdown_text += f'### {emoji} No relevant tests\n\n' else: markdown_text += f"### {emoji} PR contains tests\n\n" elif 'ticket compliance check' in key_nice.lower(): markdown_text = ticket_markdown_logic(emoji, markdown_text, value, gfm_supported) elif 'contribution time cost estimate' in key_nice.lower(): if gfm_supported: markdown_text += f"\n" else: markdown_text += f"### {emoji} Contribution time estimate (best, average, worst case): " markdown_text += f"{value['best_case'].replace('m', ' minutes')} | {value['average_case'].replace('m', ' minutes')} | {value['worst_case'].replace('m', ' minutes')}\n\n" elif 'security concerns' in key_nice.lower(): if gfm_supported: markdown_text += f"\n" else: if is_value_no(value): markdown_text += f'### {emoji} No security concerns identified\n\n' else: markdown_text += f"### {emoji} Security concerns\n\n" value = emphasize_header(value.strip(), only_markdown=True) markdown_text += f"{value}\n\n" elif 'todo sections' in key_nice.lower(): if gfm_supported: markdown_text += "\n" else: if is_value_no(value): markdown_text += f"### ✅ No TODO sections\n\n" else: markdown_todo_items = format_todo_items(value, git_provider, gfm_supported) markdown_text += f"### {emoji} TODO sections\n\n" markdown_text += 
markdown_todo_items elif 'can be split' in key_nice.lower(): if gfm_supported: markdown_text += f"\n" elif 'key issues to review' in key_nice.lower(): # value is a list of issues if is_value_no(value): if gfm_supported: markdown_text += f"\n" else: markdown_text += f"### {emoji} No major issues detected\n\n" else: issues = value if gfm_supported: markdown_text += f"\n" else: if gfm_supported: markdown_text += f"\n" else: markdown_text += f"### {emoji} {key_nice}: {value}\n\n" if gfm_supported: markdown_text += "
" markdown_text += f"{emoji} {key_nice}: {value}" markdown_text += f"
" if is_value_no(value): markdown_text += f"{emoji} No relevant tests" else: markdown_text += f"{emoji} PR contains tests" markdown_text += f"
{emoji} Contribution time estimate (best, average, worst case): " markdown_text += f"{value['best_case'].replace('m', ' minutes')} | {value['average_case'].replace('m', ' minutes')} | {value['worst_case'].replace('m', ' minutes')}" markdown_text += f"
" if is_value_no(value): markdown_text += f"{emoji} No security concerns identified" else: markdown_text += f"{emoji} Security concerns

\n\n" value = emphasize_header(value.strip()) markdown_text += f"{value}" markdown_text += f"
" if is_value_no(value): markdown_text += f"✅ No TODO sections" else: markdown_todo_items = format_todo_items(value, git_provider, gfm_supported) markdown_text += f"{emoji} TODO sections\n

\n" markdown_text += markdown_todo_items markdown_text += "
" markdown_text += process_can_be_split(emoji, value) markdown_text += f"
" markdown_text += f"{emoji} No major issues detected" markdown_text += f"
" # markdown_text += f"{emoji} {key_nice}

\n\n" markdown_text += f"{emoji} Recommended focus areas for review

\n\n" else: markdown_text += f"### {emoji} Recommended focus areas for review\n\n#### \n" for i, issue in enumerate(issues): try: if not issue or not isinstance(issue, dict): continue relevant_file = issue.get('relevant_file', '').strip() issue_header = issue.get('issue_header', '').strip() if issue_header.lower() == 'possible bug': issue_header = 'Possible Issue' # Make the header less frightening issue_content = issue.get('issue_content', '').strip() start_line = int(str(issue.get('start_line', 0)).strip()) end_line = int(str(issue.get('end_line', 0)).strip()) relevant_lines_str = extract_relevant_lines_str(end_line, files, relevant_file, start_line, dedent=True) if git_provider: reference_link = git_provider.get_line_link(relevant_file, start_line, end_line) else: reference_link = None if gfm_supported: if reference_link is not None and len(reference_link) > 0: if relevant_lines_str: issue_str = f"
{issue_header}\n\n{issue_content}\n\n\n{relevant_lines_str}\n\n
" else: issue_str = f"{issue_header}
{issue_content}" else: issue_str = f"{issue_header}
{issue_content}" else: if reference_link is not None and len(reference_link) > 0: issue_str = f"[**{issue_header}**]({reference_link})\n\n{issue_content}\n\n" else: issue_str = f"**{issue_header}**\n\n{issue_content}\n\n" markdown_text += f"{issue_str}\n\n" except Exception as e: get_logger().exception(f"Failed to process 'Recommended focus areas for review': {e}") if gfm_supported: markdown_text += f"
" markdown_text += f"{emoji} {key_nice}: {value}" markdown_text += f"
def extract_relevant_lines_str(end_line, files, relevant_file, start_line, dedent=False) -> str:
    """
    Return lines 'start_line'..'end_line' of 'relevant_file' as a fenced markdown code block.

    The first entry of 'files' whose stripped filename equals 'relevant_file' is used. The text is sliced out of
    its 'head_file'; when 'head_file' is empty the lines are taken from the file's patch instead.

    Args:
        end_line: Last line to include (1-based, inclusive).
        files: File objects exposing 'filename', 'head_file' and 'patch'. They are passed through
            set_file_languages() first, which presumably fills in the 'language' attribute used for the fence.
        relevant_file: File name to look for.
        start_line: First line to include (1-based). Values below 1 are treated as 1.
        dedent: When True, remove the leading whitespace common to all extracted lines.

    Returns:
        The extracted lines wrapped in a code fence tagged with the file language, or an empty string when
        'files' is empty, the file is not found, no line falls inside the requested range, or an error occurs
        (errors are logged, never raised).
    """
    try:
        if not files:
            return ""

        files = set_file_languages(files)
        for file in files:
            if file.filename.strip() != relevant_file:
                continue

            if file.head_file:
                head_lines = file.head_file.splitlines()
                # Clamp the start: a start_line of 0 (or less) would otherwise become a negative slice index
                # and silently select lines counted from the end of the file.
                first_index = max(start_line - 1, 0)
                relevant_lines_str = "\n".join(head_lines[first_index:end_line])
            else:
                # as a fallback, extract relevant lines directly from patch
                get_logger().info(f"No content found in file: '{file.filename}' for 'extract_relevant_lines_str'. Using patch instead")
                _, selected_lines = extract_hunk_lines_from_patch(file.patch, file.filename, start_line, end_line, side='right')
                if not selected_lines:
                    get_logger().error(f"Failed to extract relevant lines from patch: {file.filename}")
                    return ""
                # Drop removed ('-') lines and strip the one-character diff marker from the rest.
                relevant_lines_str = "".join(
                    patch_line[1:] + '\n'
                    for patch_line in selected_lines.splitlines()
                    if not patch_line.startswith('-')
                )

            if not relevant_lines_str:
                # Nothing was selected: return "" instead of an empty fenced block, so callers that test the
                # result for truthiness do not render an empty code snippet.
                return ""

            if dedent:
                # Remove the longest leading string of spaces and tabs common to all lines.
                relevant_lines_str = textwrap.dedent(relevant_lines_str)
            # Only the first matching file is used.
            return f"```{file.language}\n{relevant_lines_str}\n```"

        return ""
    except Exception as e:
        get_logger().exception(f"Failed to extract relevant lines: {e}")
        return ""
f"\n\n**[{ticket_url.split('/')[-1]}]({ticket_url}) - {ticket_compliance_level}**\n\n{explanation}\n\n" # for debugging if requires_further_human_verification: get_logger().debug(f"Ticket compliance requires further human verification", artifact={'ticket_url': ticket_url, 'requires_further_human_verification': requires_further_human_verification, 'compliance_level': ticket_compliance_level}) except Exception as e: get_logger().exception(f"Failed to process ticket compliance: {e}") continue # Calculate overall compliance level and emoji if all_compliance_levels: if all(level == 'Fully compliant' for level in all_compliance_levels): compliance_level = 'Fully compliant' compliance_emoji = '✅' elif all(level == 'PR Code Verified' for level in all_compliance_levels): compliance_level = 'PR Code Verified' compliance_emoji = '✅' elif any(level == 'Not compliant' for level in all_compliance_levels): # If there's a mix of compliant and non-compliant tickets if any(level in ['Fully compliant', 'PR Code Verified'] for level in all_compliance_levels): compliance_level = 'Partially compliant' compliance_emoji = '🔶' else: compliance_level = 'Not compliant' compliance_emoji = '❌' elif any(level == 'Partially compliant' for level in all_compliance_levels): compliance_level = 'Partially compliant' compliance_emoji = '🔶' else: compliance_level = 'PR Code Verified' compliance_emoji = '✅' # Set extra statistics outside the ticket loop get_settings().set('config.extra_statistics', {'compliance_level': compliance_level}) # editing table row for ticket compliance analysis if gfm_supported: markdown_text += f"\n\n" markdown_text += f"**{emoji} Ticket compliance analysis {compliance_emoji}**\n\n" markdown_text += ticket_compliance_str markdown_text += f"\n" else: markdown_text += f"### {emoji} Ticket compliance analysis {compliance_emoji}\n\n" markdown_text += ticket_compliance_str + "\n\n" return markdown_text def process_can_be_split(emoji, value): try: # key_nice = "Can this PR be 
split?" key_nice = "Multiple PR themes" markdown_text = "" if not value or isinstance(value, list) and len(value) == 1: value = "No" # markdown_text += f" {emoji} {key_nice}\n\n{value}\n\n\n" # markdown_text += f"### {emoji} No multiple PR themes\n\n" markdown_text += f"{emoji} No multiple PR themes\n\n" else: markdown_text += f"{emoji} {key_nice}

\n\n" for i, split in enumerate(value): title = split.get('title', '') relevant_files = split.get('relevant_files', []) markdown_text += f"
\nSub-PR theme: {title}\n\n" markdown_text += f"___\n\nRelevant files:\n\n" for file in relevant_files: markdown_text += f"- {file}\n" markdown_text += f"___\n\n" markdown_text += f"
\n\n" # markdown_text += f"#### Sub-PR theme: {title}\n\n" # markdown_text += f"Relevant files:\n\n" # for file in relevant_files: # markdown_text += f"- {file}\n" # markdown_text += "\n" # number_of_splits = len(value) # markdown_text += f" {emoji} {key_nice}\n" # for i, split in enumerate(value): # title = split.get('title', '') # relevant_files = split.get('relevant_files', []) # if i == 0: # markdown_text += f"
\nSub-PR theme:
{title}
\n\n" # markdown_text += f"
\n" # markdown_text += f"Relevant files:\n" # markdown_text += f"
    \n" # for file in relevant_files: # markdown_text += f"
  • {file}
  • \n" # markdown_text += f"
\n\n
\n" # else: # markdown_text += f"\n
\nSub-PR theme:
{title}
\n\n" # markdown_text += f"
\n" # markdown_text += f"Relevant files:\n" # markdown_text += f"
    \n" # for file in relevant_files: # markdown_text += f"
  • {file}
  • \n" # markdown_text += f"
\n\n
\n" except Exception as e: get_logger().exception(f"Failed to process can be split: {e}") return "" return markdown_text def parse_code_suggestion(code_suggestion: dict, i: int = 0, gfm_supported: bool = True) -> str: """ Convert a dictionary of data into markdown format. Args: code_suggestion (dict): A dictionary containing data to be converted to markdown format. Returns: str: A string containing the markdown formatted text generated from the input dictionary. """ markdown_text = "" if gfm_supported and 'relevant_line' in code_suggestion: markdown_text += '' for sub_key, sub_value in code_suggestion.items(): try: if sub_key.lower() == 'relevant_file': relevant_file = sub_value.strip('`').strip('"').strip("'") markdown_text += f"" # continue elif sub_key.lower() == 'suggestion': markdown_text += (f"" f"") elif sub_key.lower() == 'relevant_line': markdown_text += f"" sub_value_list = sub_value.split('](') relevant_line = sub_value_list[0].lstrip('`').lstrip('[') if len(sub_value_list) > 1: link = sub_value_list[1].rstrip(')').strip('`') markdown_text += f"" else: markdown_text += f"" markdown_text += "" except Exception as e: get_logger().exception(f"Failed to parse code suggestion: {e}") pass markdown_text += '
relevant file{relevant_file}
{sub_key}      \n\n\n\n{sub_value.strip()}\n\n\n
relevant line{relevant_line}{relevant_line}
' markdown_text += "
" else: for sub_key, sub_value in code_suggestion.items(): if isinstance(sub_key, str): sub_key = sub_key.rstrip() if isinstance(sub_value,str): sub_value = sub_value.rstrip() if isinstance(sub_value, dict): # "code example" markdown_text += f" - **{sub_key}:**\n" for code_key, code_value in sub_value.items(): # 'before' and 'after' code code_str = f"```\n{code_value}\n```" code_str_indented = textwrap.indent(code_str, ' ') markdown_text += f" - **{code_key}:**\n{code_str_indented}\n" else: if "relevant_file" in sub_key.lower(): markdown_text += f"\n - **{sub_key}:** {sub_value} \n" else: markdown_text += f" **{sub_key}:** {sub_value} \n" if "relevant_line" not in sub_key.lower(): # nicer presentation # markdown_text = markdown_text.rstrip('\n') + "\\\n" # works for gitlab markdown_text = markdown_text.rstrip('\n') + " \n" # works for gitlab and bitbucker markdown_text += "\n" return markdown_text def try_fix_json(review, max_iter=10, code_suggestions=False): """ Fix broken or incomplete JSON messages and return the parsed JSON data. Args: - review: A string containing the JSON message to be fixed. - max_iter: An integer representing the maximum number of iterations to try and fix the JSON message. - code_suggestions: A boolean indicating whether to try and fix JSON messages with code feedback. Returns: - data: A dictionary containing the parsed JSON data. The function attempts to fix broken or incomplete JSON messages by parsing until the last valid code suggestion. If the JSON message ends with a closing bracket, the function calls the fix_json_escape_char function to fix the message. If code_suggestions is True and the JSON message contains code feedback, the function tries to fix the JSON message by parsing until the last valid code suggestion. The function uses regular expressions to find the last occurrence of "}," with any number of whitespaces or newlines. It tries to parse the JSON message with the closing bracket and checks if it is valid. 
def fix_json_escape_char(json_message=None):
    """
    Parse a JSON string, blanking out the characters that make the parser fail.

    Every time json.loads() rejects the message, the character at the reported error position is replaced with a
    single space and parsing is retried. The repair runs iteratively, so the number of offending characters is
    not limited by the interpreter's recursion depth.

    Args:
        json_message (str): A string containing the JSON message to be fixed.

    Returns:
        The parsed JSON data (a dict for a JSON object).

    Raises:
        json.JSONDecodeError: If the message cannot be repaired, i.e. the error position lies outside the message
            (typically truncated input) or already holds a space, so blanking it would change nothing.
        TypeError: If 'json_message' is not something json.loads() accepts (for example None).
    """
    message = json_message
    while True:
        try:
            return json.loads(message)
        except json.JSONDecodeError as e:
            # 'pos' is the offset that the error text reports as "(char N)".
            bad_index = e.pos
            if not 0 <= bad_index < len(message) or message[bad_index] == ' ':
                # No character left to blank out at that position: retrying would loop forever.
                raise
            # Remove the offending character:
            message = message[:bad_index] + ' ' + message[bad_index + 1:]
def load_yaml(response_text: str, keys_fix_yaml: List[str] = None, first_key="", last_key="") -> dict:
    """
    Parse an AI response as YAML, falling back to try_fix_yaml() when plain parsing fails.

    Before parsing, surrounding newlines, a leading 'yaml' or '```yaml' marker, trailing whitespace and a
    trailing '```' fence are removed from the response.

    Args:
        response_text: Raw text returned by the model.
        keys_fix_yaml: Extra keys forwarded to try_fix_yaml(). Defaults to no extra keys.
        first_key: Forwarded to try_fix_yaml().
        last_key: Forwarded to try_fix_yaml().

    Returns:
        The object produced by yaml.safe_load(), or whatever try_fix_yaml() returns when the first attempt
        raises. That result can be falsy when every fallback fails; an error is logged in that case.
    """
    if keys_fix_yaml is None:
        # A fresh list per call instead of a shared mutable default argument.
        keys_fix_yaml = []

    # Strings are immutable, so keeping a reference is enough to preserve the untouched response.
    response_text_original = response_text
    response_text = response_text.strip('\n').removeprefix('yaml').removeprefix('```yaml').rstrip().removesuffix('```')
    try:
        data = yaml.safe_load(response_text)
    except Exception as e:
        get_logger().warning(f"Initial failure to parse AI prediction: {e}")
        data = try_fix_yaml(response_text, keys_fix_yaml=keys_fix_yaml, first_key=first_key, last_key=last_key,
                            response_text_original=response_text_original)
        if not data:
            get_logger().error("Failed to parse AI prediction after fallbacks",
                               artifact={'response_text': response_text})
        else:
            get_logger().info("Successfully parsed AI prediction after fallbacks",
                              artifact={'response_text': response_text})
    return data
line:', 'suggestion content:', 'relevant file:', 'existing code:', 'improved code:', 'label:', 'why:', 'suggestion_summary:'] keys_yaml = keys_yaml + keys_fix_yaml # first fallback - try to convert 'relevant line: ...' to relevant line: |-\n ...' response_text_lines_copy = response_text_lines.copy() for i in range(0, len(response_text_lines_copy)): for key in keys_yaml: if key in response_text_lines_copy[i] and not '|' in response_text_lines_copy[i]: response_text_lines_copy[i] = response_text_lines_copy[i].replace(f'{key}', f'{key} |\n ') try: data = yaml.safe_load('\n'.join(response_text_lines_copy)) get_logger().info(f"Successfully parsed AI prediction after adding |-\n") return data except: pass # 1.5 fallback - try to convert '|' to '|2'. Will solve cases of indent decreasing during the code response_text_copy = copy.deepcopy(response_text) response_text_copy = response_text_copy.replace('|\n', '|2\n') try: data = yaml.safe_load(response_text_copy) get_logger().info(f"Successfully parsed AI prediction after replacing | with |2") return data except: # if it fails, we can try to add spaces to the lines that are not indented properly, and contain '}'. 
response_text_lines_copy = response_text_copy.split('\n') for i in range(0, len(response_text_lines_copy)): initial_space = len(response_text_lines_copy[i]) - len(response_text_lines_copy[i].lstrip()) if initial_space == 2 and '|2' not in response_text_lines_copy[i] and '}' in response_text_lines_copy[i]: response_text_lines_copy[i] = ' ' + response_text_lines_copy[i].lstrip() try: data = yaml.safe_load('\n'.join(response_text_lines_copy)) get_logger().info(f"Successfully parsed AI prediction after replacing | with |2 and adding spaces") return data except: pass # second fallback - try to extract only range from first ```yaml to the last ``` snippet_pattern = r'```yaml([\s\S]*?)```(?=\s*$|")' snippet = re.search(snippet_pattern, '\n'.join(response_text_lines_copy)) if not snippet: snippet = re.search(snippet_pattern, response_text_original) # before we removed the "```" if snippet: snippet_text = snippet.group() try: data = yaml.safe_load(snippet_text.removeprefix('```yaml').rstrip('`')) get_logger().info(f"Successfully parsed AI prediction after extracting yaml snippet") return data except: pass # third fallback - try to remove leading and trailing curly brackets response_text_copy = response_text.strip().rstrip().removeprefix('{').removesuffix('}').rstrip(':\n') try: data = yaml.safe_load(response_text_copy) get_logger().info(f"Successfully parsed AI prediction after removing curly brackets") return data except: pass # forth fallback - try to extract yaml snippet by 'first_key' and 'last_key' # note that 'last_key' can be in practice a key that is not the last key in the yaml snippet. 
# it just needs to be some inner key, so we can look for newlines after it if first_key and last_key: index_start = response_text.find(f"\n{first_key}:") if index_start == -1: index_start = response_text.find(f"{first_key}:") index_last_code = response_text.rfind(f"{last_key}:") index_end = response_text.find("\n\n", index_last_code) # look for newlines after last_key if index_end == -1: index_end = len(response_text) response_text_copy = response_text[index_start:index_end].strip().strip('```yaml').strip('`').strip() if response_text_copy: try: data = yaml.safe_load(response_text_copy) get_logger().info(f"Successfully parsed AI prediction after extracting yaml snippet") return data except: pass # fifth fallback - try to remove leading '+' (sometimes added by AI for 'existing code' and 'improved code') response_text_lines_copy = response_text_lines.copy() for i in range(0, len(response_text_lines_copy)): if response_text_lines_copy[i].startswith('+'): response_text_lines_copy[i] = ' ' + response_text_lines_copy[i][1:] try: data = yaml.safe_load('\n'.join(response_text_lines_copy)) get_logger().info(f"Successfully parsed AI prediction after removing leading '+'") return data except: pass # sixth fallback - replace tabs with spaces if '\t' in response_text: response_text_copy = copy.deepcopy(response_text) response_text_copy = response_text_copy.replace('\t', ' ') try: data = yaml.safe_load(response_text_copy) get_logger().info(f"Successfully parsed AI prediction after replacing tabs with spaces") return data except: pass # seventh fallback - add indent for sections of code blocks response_text_copy = copy.deepcopy(response_text) response_text_copy_lines = response_text_copy.split('\n') start_line = -1 improve_sections = ['existing_code:', 'improved_code:', 'response:', 'why:'] describe_sections = ['description:', 'title:', 'changes_diagram:', 'pr_files:', 'pr_ticket:'] for i, line in enumerate(response_text_copy_lines): line_stripped = line.rstrip() if any(key in 
line_stripped for key in (improve_sections+describe_sections)): start_line = i elif line_stripped.endswith(': |') or line_stripped.endswith(': |-') or line_stripped.endswith(': |2') or any(line_stripped.endswith(key) for key in keys_yaml): start_line = -1 elif start_line != -1: response_text_copy_lines[i] = ' ' + line response_text_copy = '\n'.join(response_text_copy_lines) response_text_copy = response_text_copy.replace(' |\n', ' |2\n') try: data = yaml.safe_load(response_text_copy) get_logger().info(f"Successfully parsed AI prediction after adding indent for sections of code blocks") return data except: pass # eighth fallback - try to remove pipe chars at the root-level dicts response_text_copy = copy.deepcopy(response_text) response_text_copy = response_text_copy.lstrip('|\n') try: data = yaml.safe_load(response_text_copy) get_logger().info(f"Successfully parsed AI prediction after removing pipe chars") return data except: pass # ninth fallback - try to decode the response text with different encodings. GPT-5 can return text that is not utf-8 encoded. 
encodings_to_try = ['latin-1', 'utf-16'] for encoding in encodings_to_try: try: data = yaml.safe_load(response_text.encode(encoding).decode("utf-8")) if data: get_logger().info(f"Successfully parsed AI prediction after decoding with {encoding} encoding") return data except: pass # # sixth fallback - try to remove last lines # for i in range(1, len(response_text_lines)): # response_text_lines_tmp = '\n'.join(response_text_lines[:-i]) # try: # data = yaml.safe_load(response_text_lines_tmp) # get_logger().info(f"Successfully parsed AI prediction after removing {i} lines") # return data # except: # pass def set_custom_labels(variables, git_provider=None): if not get_settings().config.enable_custom_labels: return labels = get_settings().get('custom_labels', {}) if not labels: # set default labels labels = ['Bug fix', 'Tests', 'Bug fix with tests', 'Enhancement', 'Documentation', 'Other'] labels_list = "\n - ".join(labels) if labels else "" labels_list = f" - {labels_list}" if labels_list else "" variables["custom_labels"] = labels_list return # Set custom labels variables["custom_labels_class"] = "class Label(str, Enum):" counter = 0 labels_minimal_to_labels_dict = {} for k, v in labels.items(): description = "'" + v['description'].strip('\n').replace('\n', '\\n') + "'" # variables["custom_labels_class"] += f"\n {k.lower().replace(' ', '_')} = '{k}' # {description}" variables["custom_labels_class"] += f"\n {k.lower().replace(' ', '_')} = {description}" labels_minimal_to_labels_dict[k.lower().replace(' ', '_')] = k counter += 1 variables["labels_minimal_to_labels_dict"] = labels_minimal_to_labels_dict def get_user_labels(current_labels: List[str] = None): """ Only keep labels that has been added by the user """ try: enable_custom_labels = get_settings().config.get('enable_custom_labels', False) custom_labels = get_settings().get('custom_labels', []) if current_labels is None: current_labels = [] user_labels = [] for label in current_labels: if label.lower() in ['bug 
def clip_tokens(text: str, max_tokens: int, add_three_dots=True, num_input_tokens=None, delete_last_line=False) -> str:
    """
    Shorten 'text' so that it fits into roughly 'max_tokens' tokens.

    The cut is made on characters, not tokens: the average number of characters per token is computed for the
    whole text, and a prefix of 0.9 * chars_per_token * max_tokens characters is kept. The 0.9 factor leaves a
    10% safety margin because the characters-per-token ratio is only an average.

    Args:
        text (str): The string to clip. Empty or None input is returned unchanged.
        max_tokens (int): The maximum number of tokens allowed.
        add_three_dots (bool, optional): Append a new line holding "...(truncated)" to a clipped result.
            Defaults to True.
        num_input_tokens (int, optional): Known token count of 'text'. When None, the count is computed with the
            encoder returned by TokenEncoder.get_token_encoder(). Defaults to None.
        delete_last_line (bool, optional): Drop the (probably partial) last line of the kept prefix before the
            truncation marker is added. Defaults to False.

    Returns:
        str: 'text' itself when it is empty, already within the limit, or when clipping fails (a warning is
        logged). An empty string when 'max_tokens' is negative or the computed prefix length is zero.
        Otherwise the clipped prefix, optionally followed by the truncation marker.

    Example:
        clip_tokens("a" * 100, 10, num_input_tokens=100) keeps int(0.9 * 1.0 * 10) = 9 characters and returns
        nine "a" characters followed by a new line and "...(truncated)".
    """
    if not text:
        return text

    try:
        token_count = num_input_tokens
        if token_count is None:
            token_count = len(TokenEncoder.get_token_encoder().encode(text))

        # Already short enough: nothing to do.
        if token_count <= max_tokens:
            return text
        if max_tokens < 0:
            return ""

        safety_factor = 0.9  # reduce by 10% to be safe
        avg_chars_per_token = len(text) / token_count
        keep_chars = int(safety_factor * avg_chars_per_token * max_tokens)
        if keep_chars <= 0:
            return ""

        kept_prefix = text[:keep_chars]
        if delete_last_line:
            kept_prefix = kept_prefix.rsplit('\n', 1)[0]
        if add_three_dots:
            return kept_prefix + "\n...(truncated)"
        return kept_prefix
    except Exception as e:
        get_logger().warning(f"Failed to clip tokens: {e}")
        return text
@@[ ]?(.*)") if not diff_files: return position, absolute_position for file in diff_files: if file.filename and (file.filename.strip() == relevant_file): patch = file.patch patch_lines = patch.splitlines() delta = 0 start1, size1, start2, size2 = 0, 0, 0, 0 if absolute_position != -1: # matching absolute to relative for i, line in enumerate(patch_lines): # new hunk if line.startswith('@@'): delta = 0 match = re_hunk_header.match(line) start1, size1, start2, size2 = map(int, match.groups()[:4]) elif not line.startswith('-'): delta += 1 # absolute_position_curr = start2 + delta - 1 if absolute_position_curr == absolute_position: position = i break else: # try to find the line in the patch using difflib, with some margin of error matches_difflib: list[str | Any] = difflib.get_close_matches(relevant_line_in_file, patch_lines, n=3, cutoff=0.93) if len(matches_difflib) == 1 and matches_difflib[0].startswith('+'): relevant_line_in_file = matches_difflib[0] for i, line in enumerate(patch_lines): if line.startswith('@@'): delta = 0 match = re_hunk_header.match(line) start1, size1, start2, size2 = map(int, match.groups()[:4]) elif not line.startswith('-'): delta += 1 if relevant_line_in_file in line and line[0] != '-': position = i absolute_position = start2 + delta - 1 break if position == -1 and relevant_line_in_file[0] == '+': no_plus_line = relevant_line_in_file[1:].lstrip() for i, line in enumerate(patch_lines): if line.startswith('@@'): delta = 0 match = re_hunk_header.match(line) start1, size1, start2, size2 = map(int, match.groups()[:4]) elif not line.startswith('-'): delta += 1 if no_plus_line in line and line[0] != '-': # The model might add a '+' to the beginning of the relevant_line_in_file even if originally # it's a context line position = i absolute_position = start2 + delta - 1 break return position, absolute_position def get_rate_limit_status(github_token) -> dict: GITHUB_API_URL = get_settings(use_context=False).get("GITHUB.BASE_URL", 
"https://api.github.com").rstrip("/") # "https://api.github.com" # GITHUB_API_URL = "https://api.github.com" RATE_LIMIT_URL = f"{GITHUB_API_URL}/rate_limit" HEADERS = { "Accept": "application/vnd.github.v3+json", "Authorization": f"token {github_token}" } response = requests.get(RATE_LIMIT_URL, headers=HEADERS) try: rate_limit_info = response.json() if rate_limit_info.get('message') == 'Rate limiting is not enabled.': # for github enterprise return {'resources': {}} response.raise_for_status() # Check for HTTP errors except: # retry time.sleep(0.1) response = requests.get(RATE_LIMIT_URL, headers=HEADERS) return response.json() return rate_limit_info def validate_rate_limit_github(github_token, installation_id=None, threshold=0.1) -> bool: try: rate_limit_status = get_rate_limit_status(github_token) if installation_id: get_logger().debug(f"installation_id: {installation_id}, Rate limit status: {rate_limit_status['rate']}") # validate that the rate limit is not exceeded # validate that the rate limit is not exceeded for key, value in rate_limit_status['resources'].items(): if value['remaining'] < value['limit'] * threshold: get_logger().error(f"key: {key}, value: {value}") return False return True except Exception as e: get_logger().error(f"Error in rate limit {e}", artifact={"traceback": traceback.format_exc()}) return True def validate_and_await_rate_limit(github_token): try: rate_limit_status = get_rate_limit_status(github_token) # validate that the rate limit is not exceeded for key, value in rate_limit_status['resources'].items(): if value['remaining'] < value['limit'] // 80: get_logger().error(f"key: {key}, value: {value}") sleep_time_sec = value['reset'] - datetime.now().timestamp() sleep_time_hour = sleep_time_sec / 3600.0 get_logger().error(f"Rate limit exceeded. 
def github_action_output(output_data: dict, key_name: str):
    """
    Append `key_name=<json>` to the file named by the GITHUB_OUTPUT environment variable.

    Does nothing unless the `github_action_config.enable_output` setting is truthy.
    Any failure (including a missing GITHUB_OUTPUT variable) is logged, not raised.

    Args:
        output_data: Mapping that may contain `key_name`; a missing key is written as `{}`.
        key_name: Name of the output, also the key looked up in `output_data`.
    """
    try:
        if not get_settings().get('github_action_config.enable_output', False):
            return

        key_data = output_data.get(key_name, {})
        # ensure_ascii=False emits raw non-ASCII characters, so the file must be opened
        # as UTF-8 explicitly instead of relying on the platform default encoding.
        with open(os.environ['GITHUB_OUTPUT'], 'a', encoding='utf-8') as fh:
            print(f"{key_name}={json.dumps(key_data, indent=None, ensure_ascii=False)}", file=fh)
    except Exception as e:
        get_logger().error(f"Failed to write to GitHub Action output: {e}")
    return
\n
🛠️ Relevant configurations: \n\n" markdown_text +="
These are the relevant [configurations](https://github.com/Codium-ai/pr-agent/blob/main/pr_agent/settings/configuration.toml) for this tool:\n\n" markdown_text += f"**[config**]\n```yaml\n\n" for key, value in get_settings().config.items(): if key in skip_keys: continue markdown_text += f"{key}: {value}\n" markdown_text += "\n```\n" markdown_text += f"\n**[{relevant_section}]**\n```yaml\n\n" for key, value in get_settings().get(relevant_section, {}).items(): if key in skip_keys: continue markdown_text += f"{key}: {value}\n" markdown_text += "\n```" markdown_text += "\n
\n" return markdown_text def is_value_no(value): if not value: return True value_str = str(value).strip().lower() if value_str == 'no' or value_str == 'none' or value_str == 'false': return True return False def set_pr_string(repo_name, pr_number): return f"{repo_name}#{pr_number}" def string_to_uniform_number(s: str) -> float: """ Convert a string to a uniform number in the range [0, 1]. The uniform distribution is achieved by the nature of the SHA-256 hash function, which produces a uniformly distributed hash value over its output space. """ # Generate a hash of the string hash_object = hashlib.sha256(s.encode()) # Convert the hash to an integer hash_int = int(hash_object.hexdigest(), 16) # Normalize the integer to the range [0, 1] max_hash_int = 2 ** 256 - 1 uniform_number = float(hash_int) / max_hash_int return uniform_number def process_description(description_full: str) -> Tuple[str, List]: if not description_full: return "", [] # description_split = description_full.split(PRDescriptionHeader.FILE_WALKTHROUGH.value) if PRDescriptionHeader.FILE_WALKTHROUGH.value in description_full: try: # FILE_WALKTHROUGH are presented in a collapsible section in the description regex_pattern = r'\s*\s*

\s*' + re.escape(PRDescriptionHeader.FILE_WALKTHROUGH.value) + r'\s*

\s*
' description_split = re.split(regex_pattern, description_full, maxsplit=1, flags=re.DOTALL) # If the regex pattern is not found, fallback to the previous method if len(description_split) == 1: get_logger().debug("Could not find regex pattern for file walkthrough, falling back to simple split") description_split = description_full.split(PRDescriptionHeader.FILE_WALKTHROUGH.value, 1) except Exception as e: get_logger().warning(f"Failed to split description using regex, falling back to simple split: {e}") description_split = description_full.split(PRDescriptionHeader.FILE_WALKTHROUGH.value, 1) if len(description_split) < 2: get_logger().error("Failed to split description into base and changes walkthrough", artifact={'description': description_full}) return description_full.strip(), [] base_description_str = description_split[0].strip() changes_walkthrough_str = "" files = [] if len(description_split) > 1: changes_walkthrough_str = description_split[1] else: get_logger().debug("No changes walkthrough found") else: base_description_str = description_full.strip() return base_description_str, [] try: if changes_walkthrough_str: # get the end of the table if '\n\n___' in changes_walkthrough_str: end = changes_walkthrough_str.index("\n\n___") elif '\n___' in changes_walkthrough_str: end = changes_walkthrough_str.index("\n___") else: end = len(changes_walkthrough_str) changes_walkthrough_str = changes_walkthrough_str[:end] h = html2text.HTML2Text() h.body_width = 0 # Disable line wrapping # find all the files pattern = r'\s*\s*(
\s*(.*?)(.*?)
)\s*' files_found = re.findall(pattern, changes_walkthrough_str, re.DOTALL) for file_data in files_found: try: if isinstance(file_data, tuple): file_data = file_data[0] pattern = r'
\s*(.*?)\s*
(.*?).*?
\s*
\s*(.*?)\s*(?:
  • |•)(.*?)
  • ' res = re.search(pattern, file_data, re.DOTALL) if not res or res.lastindex != 4: pattern_back = r'
    \s*(.*?)
    (.*?).*?
    \s*
    \s*(.*?)\n\n\s*(.*?)
    ' res = re.search(pattern_back, file_data, re.DOTALL) if not res or res.lastindex != 4: pattern_back = r'
    \s*(.*?)\s*
    (.*?).*?
    \s*
    \s*(.*?)\s*-\s*(.*?)\s*
    ' # looking for hypen ('- ') res = re.search(pattern_back, file_data, re.DOTALL) if res and res.lastindex == 4: short_filename = res.group(1).strip() short_summary = res.group(2).strip() long_filename = res.group(3).strip() if long_filename.endswith('
      '): long_filename = long_filename[:-4].strip() long_summary = res.group(4).strip() long_summary = long_summary.replace('
      *', '\n*').replace('
      ','').replace('\n','
      ') long_summary = h.handle(long_summary).strip() if long_summary.startswith('\\-'): long_summary = "* " + long_summary[2:] elif not long_summary.startswith('*'): long_summary = f"* {long_summary}" files.append({ 'short_file_name': short_filename, 'full_file_name': long_filename, 'short_summary': short_summary, 'long_summary': long_summary }) else: if '...' in file_data: pass # PR with many files. some did not get analyzed else: get_logger().warning(f"Failed to parse description", artifact={'description': file_data}) except Exception as e: get_logger().exception(f"Failed to process description: {e}", artifact={'description': file_data}) except Exception as e: get_logger().exception(f"Failed to process description: {e}") return base_description_str, files def get_version() -> str: # First check pyproject.toml if running directly out of repository if os.path.exists("pyproject.toml"): if sys.version_info >= (3, 11): import tomllib with open("pyproject.toml", "rb") as f: data = tomllib.load(f) if "project" in data and "version" in data["project"]: return data["project"]["version"] else: get_logger().warning("Version not found in pyproject.toml") else: get_logger().warning("Unable to determine local version from pyproject.toml") # Otherwise get the installed pip package version try: return version('pr-agent') except PackageNotFoundError: get_logger().warning("Unable to find package named 'pr-agent'") return "unknown" def set_file_languages(diff_files) -> List[FilePatchInfo]: try: # if the language is already set, do not change it if hasattr(diff_files[0], 'language') and diff_files[0].language: return diff_files # map file extensions to programming languages language_extension_map_org = get_settings().language_extension_map_org extension_to_language = {} for language, extensions in language_extension_map_org.items(): for ext in extensions: extension_to_language[ext] = language for file in diff_files: extension_s = '.' 
def format_todo_item(todo_item: TodoItem, git_provider, gfm_supported) -> str:
    """
    Render a single TODO item as `<file> [<line>]`, optionally linked, followed by its content.

    Args:
        todo_item: Mapping with optional 'relevant_file', 'line_number' and 'content' keys.
        git_provider: Object whose `get_line_link(file, start, end)` returns a URL
            for the line, or a falsy value when no link is available.
        gfm_supported: When truthy the reference is emitted as an HTML anchor,
            otherwise as a Markdown link.

    Returns:
        The file reference, followed by ": <content>" when the item has content.
    """
    relevant_file = todo_item.get('relevant_file', '').strip()
    line_number = todo_item.get('line_number', '')
    content = todo_item.get('content', '')
    reference_link = git_provider.get_line_link(relevant_file, line_number, line_number)
    file_ref = f"{relevant_file} [{line_number}]"
    if reference_link:
        if gfm_supported:
            # Previously this branch re-assigned file_ref to itself and lost the link.
            file_ref = f"<a href='{reference_link}'>{file_ref}</a>"
        else:
            file_ref = f"[{file_ref}]({reference_link})"

    if content:
        return f"{file_ref}: {content.strip()}"
    # if content is empty, return only the file reference
    return file_ref
        \n" if len(value) > MAX_ITEMS: get_logger().debug(f"Truncating todo items to {MAX_ITEMS} items") value = value[:MAX_ITEMS] for todo_item in value: markdown_text += f"
      • {format_todo_item(todo_item, git_provider, gfm_supported)}
      • \n" markdown_text += "
      \n" else: markdown_text += f"

      {format_todo_item(value, git_provider, gfm_supported)}

      \n" else: if isinstance(value, list): if len(value) > MAX_ITEMS: get_logger().debug(f"Truncating todo items to {MAX_ITEMS} items") value = value[:MAX_ITEMS] for todo_item in value: markdown_text += f"- {format_todo_item(todo_item, git_provider, gfm_supported)}\n" else: markdown_text += f"- {format_todo_item(value, git_provider, gfm_supported)}\n" return markdown_text ================================================ FILE: pr_agent/cli.py ================================================ import argparse import asyncio import os from pr_agent.agent.pr_agent import PRAgent, commands from pr_agent.algo.utils import get_version from pr_agent.config_loader import get_settings from pr_agent.log import get_logger, setup_logger log_level = os.environ.get("LOG_LEVEL", "INFO") setup_logger(log_level) def set_parser(): parser = argparse.ArgumentParser(description='AI based pull request analyzer', usage= """\ Usage: cli.py --pr_url= []. For example: - cli.py --pr_url=... review - cli.py --pr_url=... describe - cli.py --pr_url=... improve - cli.py --pr_url=... ask "write me a poem about this PR" - cli.py --pr_url=... reflect - cli.py --issue_url=... similar_issue - cli.py --pr_url/--issue_url= help_docs [] Supported commands: - review / review_pr - Add a review that includes a summary of the PR and specific suggestions for improvement. - ask / ask_question [question] - Ask a question about the PR. - describe / describe_pr - Modify the PR title and description based on the PR's contents. - improve / improve_code - Suggest improvements to the code in the PR as pull request comments ready to commit. Extended mode ('improve --extended') employs several calls, and provides a more thorough feedback - reflect - Ask the PR author questions about the PR. - update_changelog - Update the changelog based on the PR's contents. 
def run_command(pr_url, command):
    """
    Run a single command against a PR, as if it had been typed on the command line.

    A leading '/' on `command` is dropped, the resulting string is split on whitespace
    and parsed with the CLI parser, and the parsed arguments are handed to `run`.
    """
    argv = f"--pr_url={pr_url} {command.lstrip('/')}".split()
    parsed_args = set_parser().parse_args(argv)

    # Feedback will appear in GitHub PR comments
    run(args=parsed_args)
get_logger().debug("Waiting for event queue to complete") tasks = [task for task in asyncio.all_tasks() if task is not asyncio.current_task()] if tasks: _, pending = await asyncio.wait(tasks, timeout=30) if pending: get_logger().warning( f"{len(pending)} callback tasks({[task.get_coro() for task in pending]}) did not complete within timeout" ) return result result = asyncio.run(inner()) if not result: parser.print_help() if __name__ == '__main__': run() ================================================ FILE: pr_agent/cli_pip.py ================================================ from pr_agent import cli from pr_agent.config_loader import get_settings def main(): # Fill in the following values provider = "github" # GitHub provider user_token = "..." # GitHub user token openai_key = "..." # OpenAI key pr_url = "..." # PR URL, for example 'https://github.com/Codium-ai/pr-agent/pull/809' command = "/review" # Command to run (e.g. '/review', '/describe', '/ask="What is the purpose of this PR?"') # Setting the configurations get_settings().set("CONFIG.git_provider", provider) get_settings().set("openai.key", openai_key) get_settings().set("github.user_token", user_token) # Run the command. Feedback will appear in GitHub PR comments cli.run_command(pr_url, command) if __name__ == '__main__': main() ================================================ FILE: pr_agent/config_loader.py ================================================ from os.path import abspath, dirname, join from pathlib import Path from typing import Optional from dynaconf import Dynaconf from starlette_context import context PR_AGENT_TOML_KEY = 'pr-agent' current_dir = dirname(abspath(__file__)) dynconf_kwargs = {'core_loaders': [], # DISABLE default loaders, otherwise will load toml files more than once. 'loaders': ['pr_agent.custom_merge_loader', 'dynaconf.loaders.env_loader'], # Use a custom loader to merge sections, but overwrite their overlapping values. Also support ENV variables to take precedence. 
'root_path': join(current_dir, "settings"), #Used for Dynaconf.find_file() - So that root path points to settings folder, since we disabled all core loaders. 'merge_enabled': True # In case more than one file is sent, merge them. Must be set to True, otherwise, a .toml file with section [XYZ] overwrites the entire section of a previous .toml file's [XYZ] and we want it to only overwrite the overlapping fields under such section } global_settings = Dynaconf( envvar_prefix=False, load_dotenv=False, # Security: Don't load .env files settings_files=[join(current_dir, f) for f in [ "settings/configuration.toml", "settings/ignore.toml", "settings/generated_code_ignore.toml", "settings/language_extensions.toml", "settings/pr_reviewer_prompts.toml", "settings/pr_questions_prompts.toml", "settings/pr_line_questions_prompts.toml", "settings/pr_description_prompts.toml", "settings/code_suggestions/pr_code_suggestions_prompts.toml", "settings/code_suggestions/pr_code_suggestions_prompts_not_decoupled.toml", "settings/code_suggestions/pr_code_suggestions_reflect_prompts.toml", "settings/pr_information_from_user_prompts.toml", "settings/pr_update_changelog_prompts.toml", "settings/pr_custom_labels.toml", "settings/pr_add_docs.toml", "settings/custom_labels.toml", "settings/pr_help_prompts.toml", "settings/pr_help_docs_prompts.toml", "settings/pr_help_docs_headings_prompts.toml", "settings/.secrets.toml", "settings_prod/.secrets.toml", ]], **dynconf_kwargs ) def get_settings(use_context=False): """ Retrieves the current settings. This function attempts to fetch the settings from the starlette_context's context object. If it fails, it defaults to the global settings defined outside of this function. Returns: Dynaconf: The current settings object, either from the context or the global default. 
""" try: return context["settings"] except Exception: return global_settings # Add local configuration from pyproject.toml of the project being reviewed def _find_repository_root() -> Optional[Path]: """ Identify project root directory by recursively searching for the .git directory in the parent directories. """ cwd = Path.cwd().resolve() no_way_up = False while not no_way_up: no_way_up = cwd == cwd.parent if (cwd / ".git").is_dir(): return cwd cwd = cwd.parent return None def _find_pyproject() -> Optional[Path]: """ Search for file pyproject.toml in the repository root. """ repo_root = _find_repository_root() if repo_root: pyproject = repo_root / "pyproject.toml" return pyproject if pyproject.is_file() else None return None pyproject_path = _find_pyproject() if pyproject_path is not None: get_settings().load_file(pyproject_path, env=f'tool.{PR_AGENT_TOML_KEY}') def apply_secrets_manager_config(): """ Retrieve configuration from AWS Secrets Manager and override existing settings """ try: # Dynamic imports to avoid circular dependency (secret_providers imports config_loader) from pr_agent.secret_providers import get_secret_provider from pr_agent.log import get_logger secret_provider = get_secret_provider() if not secret_provider: return if (hasattr(secret_provider, 'get_all_secrets') and get_settings().get("CONFIG.SECRET_PROVIDER") == 'aws_secrets_manager'): try: secrets = secret_provider.get_all_secrets() if secrets: apply_secrets_to_config(secrets) get_logger().info("Applied AWS Secrets Manager configuration") except Exception as e: get_logger().error(f"Failed to apply AWS Secrets Manager config: {e}") except Exception as e: try: from pr_agent.log import get_logger get_logger().debug(f"Secret provider not configured: {e}") except: # Fail completely silently if log module is not available pass def apply_secrets_to_config(secrets: dict): """ Apply secret dictionary to configuration """ try: # Dynamic import to avoid potential circular dependency from pr_agent.log 
import get_logger except: def get_logger(): class DummyLogger: def debug(self, msg): pass return DummyLogger() for key, value in secrets.items(): if '.' in key: # nested key like "openai.key" parts = key.split('.') if len(parts) == 2: section, setting = parts section_upper = section.upper() setting_upper = setting.upper() # Set only when no existing value (prioritize environment variables) current_value = get_settings().get(f"{section_upper}.{setting_upper}") if current_value is None or current_value == "": get_settings().set(f"{section_upper}.{setting_upper}", value) get_logger().debug(f"Set {section}.{setting} from AWS Secrets Manager") ================================================ FILE: pr_agent/custom_merge_loader.py ================================================ from pathlib import Path import tomllib #tomllib should be used instead of Py toml for Python 3.11+ from jinja2.exceptions import SecurityError from pr_agent.log import get_logger def load(obj, env=None, silent=True, key=None, filename=None): """ Load and merge TOML configuration files into a Dynaconf settings object using a secure, in-house loader. This loader: - Replaces list and dict fields instead of appending/updating (non-default Dynaconf behavior). - Enforces several security checks (e.g., disallows includes/preloads and enforces .toml files). - Supports optional single-key loading. - Supports Dynaconf's fresh_vars feature for dynamic reloading. Args: obj: The Dynaconf settings instance to update. env: The current environment name (upper case). Defaults to 'DEVELOPMENT'. Note: currently unused. silent (bool): If True, suppress exceptions and log warnings/errors instead. key (str | None): Load only this top-level key (section) if provided; otherwise, load all keys from the files. filename (str | None): Custom filename for tests (not used when settings_files are provided). Returns: None """ MAX_TOML_SIZE_IN_BYTES = 100 * 1024 * 1024 # Prevent out of mem. 
exceptions by limiting to 100 MBs which is sufficient for upto 1M lines # Get the list of files to load # TODO: hasattr(obj, 'settings_files') for some reason returns False. Need to use 'settings_file' settings_files = obj.settings_files if hasattr(obj, 'settings_files') else ( obj.settings_file) if hasattr(obj, 'settings_file') else [] if not settings_files or not isinstance(settings_files, list): get_logger().warning("No settings files specified, or missing keys " "(tried looking for 'settings_files' or 'settings_file'), or not a list. Skipping loading.", artifact={'toml_obj_attributes_names': dir(obj)}) return # Storage for all loaded data accumulated_data = {} # Security: Check for forbidden configuration options if hasattr(obj, 'includes') and obj.includes: if not silent: raise SecurityError("Configuration includes forbidden option: 'includes'. Skipping loading.") get_logger().error("Configuration includes forbidden option: 'includes'. Skipping loading.") return if hasattr(obj, 'preload') and obj.preload: if not silent: raise SecurityError("Configuration includes forbidden option: 'preload'. Skipping loading.") get_logger().error("Configuration includes forbidden option: 'preload'. Skipping loading.") return for settings_file in settings_files: try: # Load the TOML file file_path = Path(settings_file) # Security: Only allow .toml files if file_path.suffix.lower() != '.toml': get_logger().warning(f"Only .toml files are allowed. Skipping: {settings_file}") continue if not file_path.exists(): get_logger().warning(f"Settings file not found: {settings_file}. Skipping it.") continue if file_path.stat().st_size > MAX_TOML_SIZE_IN_BYTES: get_logger().warning(f"Settings file too large (> {MAX_TOML_SIZE_IN_BYTES} bytes): {settings_file}. Skipping it.") continue with open(file_path, 'rb') as f: file_data = tomllib.load(f) # Handle sections (like [config], [default], etc.) 
def validate_file_security(file_data, filename):
    """
    Reject parsed configuration data that contains security-sensitive directives.

    Every key of `file_data`, and of every dict nested inside it, is compared
    case-insensitively against a deny-list of loader directives.

    Args:
        file_data: Parsed TOML data (a dict, possibly containing nested dicts).
        filename: Name or path of the file being validated; used in error messages only.

    Raises:
        SecurityError: If a forbidden key is found, or if the dicts are nested
            deeper than the allowed maximum.
    """
    MAX_DEPTH = 50

    # Forbidden key (lower case) -> why it is forbidden
    forbidden_keys_to_reasons = {
        # Include mechanisms - allow loading arbitrary files
        'dynaconf_include': 'allows including other config files dynamically',
        'dynaconf_includes': 'allows including other config files dynamically',
        'includes': 'allows including other config files dynamically',
        # Preload mechanisms - allow loading files before main config
        'preload': 'allows preloading files with potential code execution',
        'preload_for_dynaconf': 'allows preloading files with potential code execution',
        'preloads': 'allows preloading files with potential code execution',
        # Merge controls - could be used to manipulate config behavior
        'dynaconf_merge': 'allows manipulating merge behavior',
        'dynaconf_merge_enabled': 'allows manipulating merge behavior',
        'merge_enabled': 'allows manipulating merge behavior',
        # Loader controls - allow changing how configs are loaded
        'loaders_for_dynaconf': 'allows overriding loaders to execute arbitrary code',
        'loaders': 'allows overriding loaders to execute arbitrary code',
        'core_loaders': 'allows overriding core loaders',
        'core_loaders_for_dynaconf': 'allows overriding core loaders',
        # Settings module - allows loading Python modules
        'settings_module': 'allows loading Python modules with code execution',
        'settings_file_for_dynaconf': 'could override settings file location',
        'settings_files_for_dynaconf': 'could override settings file location',
        # Environment variable prefix manipulation
        'envvar_prefix': 'allows changing environment variable prefix',
        'envvar_prefix_for_dynaconf': 'allows changing environment variable prefix',
    }

    def _scan(table, path="", max_depth=MAX_DEPTH):
        # Depth-first walk; the remaining depth budget shrinks by one per nesting level.
        if max_depth <= 0:
            raise SecurityError(
                f"Maximum nesting depth exceeded at {path}. "
                f"Possible attempt to cause stack overflow."
            )
        for key, value in table.items():
            full_path = f"{path}.{key}" if path else key
            lowered = key.lower()
            if lowered in forbidden_keys_to_reasons:
                raise SecurityError(
                    f"Security error in {filename}: "
                    f"Forbidden directive '{key}' found at {full_path}. Reason: {forbidden_keys_to_reasons[lowered]}"
                )
            if isinstance(value, dict):
                _scan(value, path=full_path, max_depth=(max_depth - 1))

    _scan(file_data, max_depth=MAX_DEPTH)
def get_git_provider_with_context(pr_url) -> GitProvider:
    """
    Get a GitProvider instance for the given PR URL.

    When running inside a request context that already holds a provider for this
    `pr_url`, that instance is returned. Otherwise a new provider of the configured
    type is created and, when a context is available, stored there for reuse.

    Raises:
        ValueError: If the provider is unknown or could not be created; the original
            error is attached as the cause.
    """
    is_context_env = None
    try:
        is_context_env = context.get("settings", None)
    except Exception:
        pass  # we are not in a context environment (CLI)

    # The cache is stored as {pr_url: provider}, so it has to be read with the actual
    # URL as the key. Looking up the literal string "pr_url" never found the entry.
    if is_context_env:
        cached_provider = context.get("git_provider", {}).get(pr_url)
        if cached_provider:
            return cached_provider

    try:
        provider_id = get_settings().config.git_provider
        if provider_id not in _GIT_PROVIDERS:
            raise ValueError(f"Unknown git provider: {provider_id}")
        git_provider = _GIT_PROVIDERS[provider_id](pr_url)
        if is_context_env:
            context["git_provider"] = {pr_url: git_provider}
        return git_provider
    except Exception as e:
        raise ValueError(f"Failed to get git provider for {pr_url}") from e
def publish_code_suggestions(self, code_suggestions: list) -> bool:
    """
    Publish code suggestions as comment threads on the PR.

    One thread is created per suggestion, positioned through
    ``right_file_start`` / ``right_file_end`` on the suggestion's line range.
    The thread status comes from the ``azure_devops.default_comment_status``
    setting (default ``"closed"``).

    Args:
        code_suggestions: dicts providing the keys ``'body'``, ``'relevant_file'``,
            ``'relevant_lines_start'`` and ``'relevant_lines_end'``.

    Returns:
        Always ``True``. Suggestions with a missing/``-1`` start line or with an
        end line before the start line are skipped with a warning, and a failure
        to create a thread is logged without being re-raised.
    """
    status = get_settings().azure_devops.get("default_comment_status", "closed")
    for suggestion in code_suggestions:
        body = suggestion['body']
        relevant_file = suggestion['relevant_file']
        relevant_lines_start = suggestion['relevant_lines_start']
        relevant_lines_end = suggestion['relevant_lines_end']

        if not relevant_lines_start or relevant_lines_start == -1:
            get_logger().warning(
                f"Failed to publish code suggestion, relevant_lines_start is {relevant_lines_start}")
            continue

        if relevant_lines_end < relevant_lines_start:
            get_logger().warning(f"Failed to publish code suggestion, "
                                 f"relevant_lines_end is {relevant_lines_end} and "
                                 f"relevant_lines_start is {relevant_lines_start}")
            continue

        thread_context = CommentThreadContext(
            file_path=relevant_file,
            right_file_start=CommentPosition(offset=1, line=relevant_lines_start),
            right_file_end=CommentPosition(offset=1, line=relevant_lines_end))
        comment = Comment(content=body, comment_type=1)
        thread = CommentThread(comments=[comment], thread_context=thread_context, status=status)
        try:
            self.azure_devops_client.create_thread(
                comment_thread=thread,
                project=self.workspace_slug,
                repository_id=self.repo_slug,
                pull_request_id=self.pr_num
            )
        except Exception as e:
            # Best effort: one failing suggestion must not prevent the remaining ones.
            get_logger().error(f"Azure failed to publish code suggestion, error: {e}", suggestion=suggestion)
    return True
def get_files(self):
    """
    Return the distinct file paths changed by any commit of the PR.

    Every commit of the pull request is queried for its changes and the
    ``item.path`` of each change is collected. Duplicates are removed; the
    order of the returned list is unspecified.
    """
    client = self.azure_devops_client
    pr_commits = client.get_pull_request_commits(
        project=self.workspace_slug,
        repository_id=self.repo_slug,
        pull_request_id=self.pr_num,
    )
    unique_paths = set()
    for commit in pr_commits:
        commit_changes = client.get_changes(
            project=self.workspace_slug,
            repository_id=self.repo_slug,
            commit_id=commit.commit_id,
        )
        unique_paths.update(change["item"]["path"] for change in commit_changes.changes)
    return list(unique_paths)
get_diff_files(self) -> list[FilePatchInfo]: try: if self.diff_files: return self.diff_files base_sha = self.pr.last_merge_target_commit head_sha = self.pr.last_merge_commit # Get PR iterations iterations = self.azure_devops_client.get_pull_request_iterations( repository_id=self.repo_slug, pull_request_id=self.pr_num, project=self.workspace_slug ) changes = None if iterations: iteration_id = iterations[-1].id # Get the last iteration (most recent changes) # Get changes for the iteration changes = self.azure_devops_client.get_pull_request_iteration_changes( repository_id=self.repo_slug, pull_request_id=self.pr_num, iteration_id=iteration_id, project=self.workspace_slug ) diff_files = [] diffs = [] diff_types = {} if changes: for change in changes.change_entries: item = change.additional_properties.get('item', {}) path = item.get('path', None) if path: diffs.append(path) diff_types[path] = change.additional_properties.get('changeType', 'Unknown') # wrong implementation - gets all the files that were changed in any commit in the PR # commits = self.azure_devops_client.get_pull_request_commits( # project=self.workspace_slug, # repository_id=self.repo_slug, # pull_request_id=self.pr_num, # ) # # diff_files = [] # diffs = [] # diff_types = {} # for c in commits: # changes_obj = self.azure_devops_client.get_changes( # project=self.workspace_slug, # repository_id=self.repo_slug, # commit_id=c.commit_id, # ) # for i in changes_obj.changes: # if i["item"]["gitObjectType"] == "tree": # continue # diffs.append(i["item"]["path"]) # diff_types[i["item"]["path"]] = i["changeType"] # # diffs = list(set(diffs)) diffs_original = diffs diffs = filter_ignored(diffs_original, 'azure') if diffs_original != diffs: try: get_logger().info(f"Filtered out [ignore] files for pull request:", extra= {"files": diffs_original, # diffs is just a list of names "filtered_files": diffs}) except Exception: pass invalid_files_names = [] for file in diffs: if not is_valid_file(file): 
invalid_files_names.append(file) continue version = GitVersionDescriptor( version=head_sha.commit_id, version_type="commit" ) try: new_file_content_str = self.azure_devops_client.get_item( repository_id=self.repo_slug, path=file, project=self.workspace_slug, version_descriptor=version, download=False, include_content=True, ) new_file_content_str = new_file_content_str.content except Exception as error: get_logger().error(f"Failed to retrieve new file content of {file} at version {version}", error=error) # get_logger().error( # "Failed to retrieve new file content of %s at version %s. Error: %s", # file, # version, # str(error), # ) new_file_content_str = "" edit_type = EDIT_TYPE.MODIFIED if diff_types[file] == "add": edit_type = EDIT_TYPE.ADDED elif diff_types[file] == "delete": edit_type = EDIT_TYPE.DELETED elif "rename" in diff_types[file]: # diff_type can be `rename` | `edit, rename` edit_type = EDIT_TYPE.RENAMED version = GitVersionDescriptor( version=base_sha.commit_id, version_type="commit" ) if edit_type == EDIT_TYPE.ADDED or edit_type == EDIT_TYPE.RENAMED: original_file_content_str = "" else: try: original_file_content_str = self.azure_devops_client.get_item( repository_id=self.repo_slug, path=file, project=self.workspace_slug, version_descriptor=version, download=False, include_content=True, ) original_file_content_str = original_file_content_str.content except Exception as error: get_logger().error(f"Failed to retrieve original file content of {file} at version {version}", error=error) original_file_content_str = "" patch = load_large_diff( file, new_file_content_str, original_file_content_str, show_warning=False ).rstrip() # count number of lines added and removed patch_lines = patch.splitlines(keepends=True) num_plus_lines = len([line for line in patch_lines if line.startswith('+')]) num_minus_lines = len([line for line in patch_lines if line.startswith('-')]) diff_files.append( FilePatchInfo( original_file_content_str, new_file_content_str, 
def publish_comment(self, pr_comment: str, is_temporary: bool = False, thread_context=None) -> Comment:
    """
    Post ``pr_comment`` as the first comment of a new thread on the PR.

    A temporary comment is skipped (``None`` is returned) when the
    ``config.publish_output_progress`` setting is falsy. Otherwise a thread is
    created with the status taken from ``azure_devops.default_comment_status``
    (default ``"closed"``) and the optional ``thread_context``.

    Returns:
        The first comment of the created thread, with ``thread_id`` set to the
        id of that thread. Temporary comments are also appended to
        ``self.temp_comments``.
    """
    skip_temporary = is_temporary and not get_settings().config.publish_output_progress
    if skip_temporary:
        get_logger().debug(f"Skipping publish_comment for temporary comment: {pr_comment}")
        return None

    new_thread = CommentThread(
        comments=[Comment(content=pr_comment)],
        thread_context=thread_context,
        status=get_settings().azure_devops.get("default_comment_status", "closed"),
    )
    created_thread = self.azure_devops_client.create_thread(
        comment_thread=new_thread,
        project=self.workspace_slug,
        repository_id=self.repo_slug,
        pull_request_id=self.pr_num,
    )

    first_comment = created_thread.comments[0]
    # The thread id is stored on the comment itself so later edit/delete calls can address it.
    first_comment.thread_id = created_thread.id
    if is_temporary:
        self.temp_comments.append(first_comment)
    return first_comment
      ✨ Describe tool usage guide:
      ' ind = pr_body.find(usage_guide_text) if ind != -1: pr_body = pr_body[:ind] if len(pr_body) > MAX_PR_DESCRIPTION_AZURE_LENGTH: changes_walkthrough_text = PRDescriptionHeader.FILE_WALKTHROUGH.value ind = pr_body.find(changes_walkthrough_text) if ind != -1: pr_body = pr_body[:ind] if len(pr_body) > MAX_PR_DESCRIPTION_AZURE_LENGTH: trunction_message = " ... (description truncated due to length limit)" pr_body = pr_body[:MAX_PR_DESCRIPTION_AZURE_LENGTH - len(trunction_message)] + trunction_message get_logger().warning("PR description was truncated due to length limit") try: updated_pr = GitPullRequest() updated_pr.title = pr_title updated_pr.description = pr_body self.azure_devops_client.update_pull_request( project=self.workspace_slug, repository_id=self.repo_slug, pull_request_id=self.pr_num, git_pull_request_to_update=updated_pr, ) except Exception as e: get_logger().exception( f"Could not update pull request {self.pr_num} description: {e}" ) def remove_initial_comment(self): try: for comment in self.temp_comments: self.remove_comment(comment) except Exception as e: get_logger().exception(f"Failed to remove temp comments, error: {e}") def publish_inline_comment(self, body: str, relevant_file: str, relevant_line_in_file: str, original_suggestion=None): self.publish_inline_comments([self.create_inline_comment(body, relevant_file, relevant_line_in_file)]) def create_inline_comment(self, body: str, relevant_file: str, relevant_line_in_file: str, absolute_position: int = None): position, absolute_position = find_line_number_of_relevant_line_in_file(self.get_diff_files(), relevant_file.strip('`'), relevant_line_in_file, absolute_position) if position == -1: if get_settings().config.verbosity_level >= 2: get_logger().info(f"Could not find position for {relevant_file} {relevant_line_in_file}") subject_type = "FILE" else: subject_type = "LINE" path = relevant_file.strip() return dict(body=body, path=path, position=position, absolute_position=absolute_position) if 
def get_languages(self):
    """
    Return the share of each file extension in the repository, in percent.

    All items of the repository are listed recursively; only items whose
    ``git_object_type`` is ``"blob"`` are counted, keyed by their extension
    without the leading dot. Files without an extension are ignored.

    Returns:
        dict mapping extension -> percentage (0-100) of the counted files.
        Empty when no file with an extension exists.
    """
    repo_items = self.azure_devops_client.get_items(
        project=self.workspace_slug,
        repository_id=self.repo_slug,
        recursion_level="Full",
        include_content_metadata=True,
        include_links=False,
        download=False,
    )

    counts_by_extension = {}
    for item in repo_items:
        if item.git_object_type != "blob":
            continue
        extension = os.path.splitext(item.path)[1][1:]
        if extension:
            counts_by_extension[extension] = counts_by_extension.get(extension, 0) + 1

    counted_files = sum(counts_by_extension.values())
    return {
        extension: (count / counted_files) * 100
        for extension, count in counts_by_extension.items()
    }
def get_thread_context(self, thread_id: int) -> CommentThreadContext:
    """
    Return the ``thread_context`` of a PR comment thread.

    Args:
        thread_id: id of the thread to fetch from the current pull request.

    Returns:
        The ``thread_context`` attribute of the fetched thread. If fetching
        fails, the error is logged and ``None`` is returned.
    """
    try:
        thread = self.azure_devops_client.get_pull_request_thread(
            self.repo_slug, self.pr_num, thread_id, self.workspace_slug
        )
        return thread.thread_context
    except Exception as e:
        # The message previously claimed a failure to *set the thread status*,
        # which pointed log readers at the wrong operation.
        get_logger().exception(f"Failed to get thread context, error: {e}")
        return None
components for an Azure DevOps PR URL") # Verify that the second-to-last path component is "pullrequest" if path_parts[num_parts - 2] != "pullrequest": raise ValueError("The provided URL does not follow the expected Azure DevOps PR URL format") workspace_slug = path_parts[num_parts - 5] repo_slug = path_parts[num_parts - 3] try: pr_number = int(path_parts[num_parts - 1]) except ValueError as e: raise ValueError("Cannot parse PR number in the provided URL") from e return workspace_slug, repo_slug, pr_number @staticmethod def _get_azure_devops_client() -> Tuple[GitClient, WorkItemTrackingClient]: org = get_settings().azure_devops.get("org", None) pat = get_settings().azure_devops.get("pat", None) if not org: raise ValueError("Azure DevOps organization is required") if pat: auth_token = pat else: try: # try to use azure default credentials # see https://learn.microsoft.com/en-us/python/api/overview/azure/identity-readme?view=azure-python # for usage and env var configuration of user-assigned managed identity, local machine auth etc. 
def get_linked_work_items(self) -> list:
    """
    Get the work items linked to the PR.

    The work item references of the pull request are fetched and their ids are
    resolved through ``self.get_work_items``.

    Returns:
        The list returned by ``get_work_items`` for the linked ids, or ``[]``
        when the PR has no linked work items or any call fails (the failure is
        logged).
    """
    try:
        work_item_refs = self.azure_devops_client.get_pull_request_work_item_refs(
            project=self.workspace_slug,
            repository_id=self.repo_slug,
            pull_request_id=self.pr_num,
        )
        # Check for "nothing linked" before iterating, so an empty or missing
        # response returns early instead of going through the error path.
        if not work_item_refs:
            return []
        linked_ids = [work_item_ref.id for work_item_ref in work_item_refs]
        return self.get_work_items(linked_ids)
    except Exception as e:
        get_logger().exception(f"Failed to get linked work items, error: {e}")
        return []
def get_repo_settings(self):
    """
    Fetch the repository's ``.pr_agent.toml`` from the PR destination branch.

    Returns:
        The file content as UTF-8 encoded bytes, or ``""`` when the file does
        not exist, the request is answered with an error status, or any
        exception occurs.
    """
    try:
        url = (f"https://api.bitbucket.org/2.0/repositories/{self.workspace_slug}/{self.repo_slug}/src/"
               f"{self.pr.destination_branch}/.pr_agent.toml")
        response = requests.request("GET", url, headers=self.headers)
        if not response.ok:
            # 404 simply means the repo has no settings file. Any other error status
            # (401/403/5xx, ...) carries an error payload that must not be handed to
            # the caller as if it were the settings file.
            if response.status_code != 404:
                get_logger().warning(
                    f"Failed to get repo settings from Bitbucket, status code: {response.status_code}")
            return ""
        contents = response.text.encode('utf-8')
        return contents
    except Exception:
        return ""
def get_canonical_url_parts(self, repo_git_url:str=None, desired_branch:str=None) -> Tuple[str, str]:
    """
    Return the (prefix, suffix) URL parts used to link to a file of a repo.

    Example: git url ``https://bitbucket.org/codiumai/pr-agent.git`` with branch
    ``main`` -> prefix ``https://bitbucket.org/codiumai/pr-agent/src/main``, suffix ``""``.

    Args:
        repo_git_url: git clone URL of the repository. When omitted, the PR URL,
            workspace and repo of this provider are used and ``desired_branch``
            is replaced by the repository's default branch.
        desired_branch: branch to link to (only honoured together with ``repo_git_url``).

    Returns:
        ``(prefix, "")``, or ``("", "")`` when ``repo_git_url`` does not have the
        form ``/<workspace>/<repo>.git``.
    """
    if not repo_git_url:
        desired_branch = self.get_repo_default_branch()
        pr_location = urlparse(self.pr_url)
        base_url = "://".join((pr_location.scheme, pr_location.netloc))
        workspace_name, project_name = self.workspace_slug, self.repo_slug
    else:
        git_location = urlparse(repo_git_url)
        base_url = "://".join((git_location.scheme, git_location.netloc))
        # Keep what precedes the first '.git' and drop the leading '/'.
        repo_path = git_location.path.split('.git')[0][1:]
        path_segments = repo_path.split('/')
        if len(path_segments) != 2:
            get_logger().error(f"repo_git_url is not a valid git repo url: {repo_git_url}")
            return ("", "")
        workspace_name, project_name = path_segments

    return (f"{base_url}/{workspace_name}/{project_name}/src/{desired_branch}", "")
""" post_parameters_list = [] for suggestion in code_suggestions: body = suggestion["body"] original_suggestion = suggestion.get('original_suggestion', None) # needed for diff code if original_suggestion: try: existing_code = original_suggestion['existing_code'].rstrip() + "\n" improved_code = original_suggestion['improved_code'].rstrip() + "\n" diff = difflib.unified_diff(existing_code.split('\n'), improved_code.split('\n'), n=999) patch_orig = "\n".join(diff) patch = "\n".join(patch_orig.splitlines()[5:]).strip('\n') diff_code = f"\n\n```diff\n{patch.rstrip()}\n```" # replace ```suggestion ... ``` with diff_code, using regex: body = re.sub(r'```suggestion.*?```', diff_code, body, flags=re.DOTALL) except Exception as e: get_logger().exception(f"Bitbucket failed to get diff code for publishing, error: {e}") continue relevant_file = suggestion["relevant_file"] relevant_lines_start = suggestion["relevant_lines_start"] relevant_lines_end = suggestion["relevant_lines_end"] if not relevant_lines_start or relevant_lines_start == -1: get_logger().exception( f"Failed to publish code suggestion, relevant_lines_start is {relevant_lines_start}" ) continue if relevant_lines_end < relevant_lines_start: get_logger().exception( f"Failed to publish code suggestion, " f"relevant_lines_end is {relevant_lines_end} and " f"relevant_lines_start is {relevant_lines_start}" ) continue if relevant_lines_end > relevant_lines_start: post_parameters = { "body": body, "path": relevant_file, "line": relevant_lines_end, "start_line": relevant_lines_start, "start_side": "RIGHT", } else: # API is different for single line comments post_parameters = { "body": body, "path": relevant_file, "line": relevant_lines_start, "side": "RIGHT", } post_parameters_list.append(post_parameters) try: self.publish_inline_comments(post_parameters_list) return True except Exception as e: get_logger().error(f"Bitbucket failed to publish code suggestion, error: {e}") return False def publish_file_comments(self, 
def is_supported(self, capability: str) -> bool:
    """
    Tell whether this provider supports ``capability``.

    Returns ``False`` for the capabilities listed below and ``True`` for any
    other name.
    """
    unsupported_capabilities = (
        'get_issue_comments',
        'publish_inline_comments',
        'get_labels',
        'gfm_markdown',
        'publish_file_comments',
    )
    return capability not in unsupported_capabilities
that are of indices in 'diffs_original' that are not in 'diffs' if len(diff_split) > len(diffs) and len(diffs_original) == len(diff_split): diff_split = [diff_split[i] for i in range(len(diff_split)) if diffs_original[i] in diffs] if len(diff_split) != len(diffs): get_logger().error(f"Error - failed to split the diff into {len(diffs)} parts") return [] # bitbucket diff has a header for each file, we need to remove it: # "diff --git filename # new file mode 100644 (optional) # index caa56f0..61528d7 100644 # --- a/pr_agent/cli_pip.py # +++ b/pr_agent/cli_pip.py # @@ -... @@" for i, _ in enumerate(diff_split): diff_split_lines = diff_split[i].splitlines() if (len(diff_split_lines) >= 6) and \ ((diff_split_lines[2].startswith("---") and diff_split_lines[3].startswith("+++") and diff_split_lines[4].startswith("@@")) or (diff_split_lines[3].startswith("---") and # new or deleted file diff_split_lines[4].startswith("+++") and diff_split_lines[5].startswith("@@"))): diff_split[i] = "\n".join(diff_split_lines[4:]) else: if diffs[i].data.get('lines_added', 0) == 0 and diffs[i].data.get('lines_removed', 0) == 0: diff_split[i] = "" elif len(diff_split_lines) <= 3: diff_split[i] = "" get_logger().info(f"Disregarding empty diff for file {_gef_filename(diffs[i])}") else: get_logger().warning(f"Bitbucket failed to get diff for file {_gef_filename(diffs[i])}") diff_split[i] = "" invalid_files_names = [] diff_files = [] counter_valid = 0 # get full files for index, diff in enumerate(diffs): file_path = _gef_filename(diff) if not is_valid_file(file_path): invalid_files_names.append(file_path) continue try: counter_valid += 1 if get_settings().get("bitbucket_app.avoid_full_files", False): original_file_content_str = "" new_file_content_str = "" elif counter_valid < MAX_FILES_ALLOWED_FULL // 2: # factor 2 because bitbucket has limited API calls if diff.old.get_data("links"): original_file_content_str = self._get_pr_file_content( diff.old.get_data("links")['self']['href']) else: 
def publish_persistent_comment(self, pr_comment: str,
                               initial_header: str,
                               update_header: bool = True,
                               name='review',
                               final_update_message=True):
    """
    Update the existing comment that contains ``initial_header``, or publish a new one.

    The PR comments are scanned for the first one whose raw text contains
    ``initial_header``. That comment is overwritten with ``pr_comment`` and the
    method returns. When no such comment exists, or updating fails (the error
    is logged), ``pr_comment`` is published as a new comment instead.

    Args:
        pr_comment: full text of the comment to publish.
        initial_header: marker text identifying the persistent comment.
        update_header: when true, ``initial_header`` inside ``pr_comment`` is
            extended with a note naming the latest commit URL.
        name: label of the tool, used in the header note and log/notice text.
        final_update_message: when true, an additional comment linking to the
            updated persistent comment is published.
    """
    try:
        for comment in self.pr.comments():
            if initial_header not in comment.raw:
                continue

            latest_commit_url = self.get_latest_commit_url()
            comment_url = self.get_comment_url(comment)
            if update_header:
                updated_header = f"{initial_header}\n\n#### ({name.capitalize()} updated until commit {latest_commit_url})\n"
                pr_comment_updated = pr_comment.replace(initial_header, updated_header)
            else:
                pr_comment_updated = pr_comment
            get_logger().info(f"Persistent mode - updating comment {comment_url} to latest {name} message")
            payload = {"content": {"raw": pr_comment_updated}}
            # PUT the new content and refresh the local comment object from the response.
            comment._update_data(comment.put(None, data=payload))
            if final_update_message:
                self.publish_comment(
                    f"**[Persistent {name}]({comment_url})** updated to latest commit {latest_commit_url}")
            return
    except Exception as e:
        get_logger().exception(f"Failed to update persistent review, error: {e}")
    # No persistent comment found (or the update failed): publish a fresh comment.
    self.publish_comment(pr_comment)
def generate_link_to_relevant_line_number(self, suggestion) -> str:
    """
    Build a PR link that points at the line referenced by a suggestion.

    Args:
        suggestion: Mapping with the keys 'relevant_file' and 'relevant_line'.

    Returns:
        The link string, or "" when the relevant line is empty, when no absolute
        position is found (-1), when `self.pr_url` is not set, or when any error occurs.
    """
    try:
        file_name = suggestion['relevant_file'].strip('`').strip("'").rstrip()
        line_text = suggestion['relevant_line'].rstrip()
        if line_text:
            _, absolute_position = find_line_number_of_relevant_line_in_file(
                self.get_diff_files(), file_name, line_text)
            if absolute_position != -1 and self.pr_url:
                return f"{self.pr_url}/#L{file_name}T{absolute_position}"
    except Exception as e:
        # Best effort: a missing link must never break the calling tool.
        if get_settings().config.verbosity_level >= 2:
            get_logger().info(f"Failed adding line link, error: {e}")

    return ""
def get_repo_default_branch(self):
    """
    Return the name of the repository's default branch.

    The repository resource is fetched from the Bitbucket Cloud API and the name is read
    from its 'mainbranch' entry. If anything goes wrong (request failure, unexpected
    payload, missing attribute) the PR destination branch is used as a fallback.

    Note: Must be running from a PR context, since the fallback reads `self.pr`.
    """
    try:
        url_repo = f"https://api.bitbucket.org/2.0/repositories/{self.workspace_slug}/{self.repo_slug}/"
        response_repo = requests.request("GET", url_repo, headers=self.headers).json()
        return response_repo['mainbranch']['name']
    except Exception:
        # `except Exception` (not a bare `except`) so that KeyboardInterrupt / SystemExit
        # still propagate instead of being turned into a silent fallback.
        return self.pr.destination_branch
def create_or_update_pr_file(self, file_path: str, branch: str, contents="", message="") -> None:
    """
    Commit `contents` to `file_path` on `branch` through the Bitbucket Cloud `src` endpoint.

    Args:
        file_path: Path of the file inside the repository.
        branch: Branch that receives the commit.
        contents: New file contents; an empty value creates an empty file.
        message: Commit message. When empty, "Update <path>" is used if contents were
            given and "Create <path>" otherwise.

    Failures, including HTTP error responses, are logged and not raised.
    """
    url = f"https://api.bitbucket.org/2.0/repositories/{self.workspace_slug}/{self.repo_slug}/src/"
    if not message:
        message = f"Update {file_path}" if contents else f"Create {file_path}"

    files = {file_path: contents}
    data = {
        "message": message,
        "branch": branch
    }
    # Only the Authorization header is forwarded.
    # NOTE(review): presumably so `requests` can set the multipart Content-Type itself - confirm.
    headers = {'Authorization': self.headers['Authorization']} if 'Authorization' in self.headers else {}
    try:
        response = requests.request("POST", url, headers=headers, data=data, files=files)
        # `requests` does not raise on 4xx/5xx by itself; without this call a rejected
        # commit would go completely unnoticed.
        response.raise_for_status()
    except Exception:
        get_logger().exception(f"Failed to create empty file {file_path} in branch {branch}")
def _prepare_clone_url_with_token(self, repo_url_to_clone: str) -> str | None:
    """
    Embed the configured token into a bitbucket.org clone URL.

    Args:
        repo_url_to_clone: Clone URL that must contain "bitbucket.org".

    Returns:
        The URL rewritten as "<scheme>x-token-auth:<token>@bitbucket.org<rest>", or None
        when the URL is not a usable bitbucket URL or `self.auth_type` is neither
        "basic" nor "bearer".
    """
    if "bitbucket.org" not in repo_url_to_clone:
        get_logger().error("Repo URL is not a valid bitbucket URL.")
        return None

    # Split on the FIRST occurrence only: a repository path that itself contains
    # "bitbucket.org" would otherwise produce more than two parts and make the
    # two-way unpacking raise ValueError.
    scheme, base_url = repo_url_to_clone.split("bitbucket.org", 1)
    if not all([scheme, base_url]):
        get_logger().error(f"repo_url_to_clone: {repo_url_to_clone} is not a valid bitbucket URL.")
        return None

    if self.auth_type == "basic":
        # Basic auth with token
        clone_url = f"{scheme}x-token-auth:{self.basic_token}@bitbucket.org{base_url}"
    elif self.auth_type == "bearer":
        # Bearer token
        clone_url = f"{scheme}x-token-auth:{self.bearer_token}@bitbucket.org{base_url}"
    else:
        # This case should ideally not be reached if __init__ validates auth_type
        get_logger().error(f"Unsupported or uninitialized auth_type: {getattr(self, 'auth_type', 'N/A')}. Returning None")
        return None

    return clone_url
def get_git_repo_url(self, pr_url: str=None) -> str:
    """
    Return the git clone URL ("<scheme>://<host>/scm/<project>/<repo>.git") of the PR's repository.

    Args:
        pr_url: Ignored - bitbucket server does not support issue urls, so the URL stored
            on the provider (`self.pr_url`) is always used.

    Returns:
        The clone URL with lower-cased project and repository slugs, or "" on any error.
    """
    try:
        pr_location = urlparse(self.pr_url)
        project_key = self.workspace_slug.lower()
        repo_name = self.repo_slug.lower()
        return f"{pr_location.scheme}://{pr_location.netloc}/scm/{project_key}/{repo_name}.git"
    except Exception:
        get_logger().exception(f"url is not a valid merge requests url: {self.pr_url}")
        return ""
def get_canonical_url_parts(self, repo_git_url:str=None, desired_branch:str=None) -> Tuple[str, str]:
    """
    Return the (prefix, suffix) pair used to build "browse" links to files of a repository.

    Args:
        repo_git_url: Git URL of the form ".../scm/<workspace>/<project>.git". When omitted,
            the workspace and repository of the current PR context are used, and
            `desired_branch` is replaced by the repository's default branch.
        desired_branch: Branch placed in the suffix when `repo_git_url` is given.

    Returns:
        ("<server>/projects/<workspace>/repos/<project>/browse", "?at=refs%2Fheads%2F<branch>"),
        or ("", "") when the workspace/project or the default branch cannot be determined.
    """
    workspace_name = project_name = None

    if repo_git_url:
        if '.git' in repo_git_url and 'scm/' in repo_git_url:
            repo_path = repo_git_url.partition('.git')[0].rpartition('scm/')[2]
            path_pieces = repo_path.split('/')
            if len(path_pieces) == 2:  # Has to have the form <workspace>/<project>
                workspace_name, project_name = path_pieces
    else:
        workspace_name, project_name = self.workspace_slug, self.repo_slug
        branch_info = self.bitbucket_client.get_default_branch(workspace_name, project_name)
        if 'displayId' not in branch_info:
            get_logger().error(f"Cannot obtain default branch for workspace_name={workspace_name}, "
                               f"project_name={project_name}, default_branch_dict={branch_info}")
            return ("", "")
        desired_branch = branch_info['displayId']

    if not (workspace_name and project_name):
        get_logger().error(f"workspace_name or project_name not found in context, either git url: {repo_git_url} or uninitialized workspace/project.")
        return ("", "")

    prefix = f"{self.bitbucket_server_url}/projects/{workspace_name}/repos/{project_name}/browse"
    suffix = f"?at=refs%2Fheads%2F{desired_branch}"
    return (prefix, suffix)
""" post_parameters_list = [] for suggestion in code_suggestions: body = suggestion["body"] original_suggestion = suggestion.get('original_suggestion', None) # needed for diff code if original_suggestion: try: existing_code = original_suggestion['existing_code'].rstrip() + "\n" improved_code = original_suggestion['improved_code'].rstrip() + "\n" diff = difflib.unified_diff(existing_code.split('\n'), improved_code.split('\n'), n=999) patch_orig = "\n".join(diff) patch = "\n".join(patch_orig.splitlines()[5:]).strip('\n') diff_code = f"\n\n```diff\n{patch.rstrip()}\n```" # replace ```suggestion ... ``` with diff_code, using regex: body = re.sub(r'```suggestion.*?```', diff_code, body, flags=re.DOTALL) except Exception as e: get_logger().exception(f"Bitbucket failed to get diff code for publishing, error: {e}") continue relevant_file = suggestion["relevant_file"] relevant_lines_start = suggestion["relevant_lines_start"] relevant_lines_end = suggestion["relevant_lines_end"] if not relevant_lines_start or relevant_lines_start == -1: get_logger().warning( f"Failed to publish code suggestion, relevant_lines_start is {relevant_lines_start}" ) continue if relevant_lines_end < relevant_lines_start: get_logger().warning( f"Failed to publish code suggestion, " f"relevant_lines_end is {relevant_lines_end} and " f"relevant_lines_start is {relevant_lines_start}" ) continue if relevant_lines_end > relevant_lines_start: # Bitbucket does not support multi-line suggestions so use a code block instead - https://jira.atlassian.com/browse/BSERV-4553 body = body.replace("```suggestion", "```") post_parameters = { "body": body, "path": relevant_file, "line": relevant_lines_end, "start_line": relevant_lines_start, "start_side": "RIGHT", } else: # API is different for single line comments post_parameters = { "body": body, "path": relevant_file, "line": relevant_lines_start, "side": "RIGHT", } post_parameters_list.append(post_parameters) try: self.publish_inline_comments(post_parameters_list) 
return True except Exception as e: if get_settings().config.verbosity_level >= 2: get_logger().error(f"Failed to publish code suggestion, error: {e}") return False def publish_file_comments(self, file_comments: list) -> bool: pass def is_supported(self, capability: str) -> bool: if capability in ['get_issue_comments', 'get_labels', 'gfm_markdown', 'publish_file_comments']: return False return True def set_pr(self, pr_url: str): self.workspace_slug, self.repo_slug, self.pr_num = self._parse_pr_url(pr_url) self.pr = self._get_pr() def get_file(self, path: str, commit_id: str): file_content = "" try: file_content = self.bitbucket_client.get_content_of_file(self.workspace_slug, self.repo_slug, path, commit_id) except HTTPError as e: get_logger().debug(f"File {path} not found at commit id: {commit_id}") return file_content def get_files(self): changes = self.bitbucket_client.get_pull_requests_changes(self.workspace_slug, self.repo_slug, self.pr_num) diffstat = [change["path"]['toString'] for change in changes] return diffstat #gets the best common ancestor: https://git-scm.com/docs/git-merge-base @staticmethod def get_best_common_ancestor(source_commits_list, destination_commits_list, guaranteed_common_ancestor) -> str: destination_commit_hashes = {commit['id'] for commit in destination_commits_list} | {guaranteed_common_ancestor} for commit in source_commits_list: for parent_commit in commit['parents']: if parent_commit['id'] in destination_commit_hashes: return parent_commit['id'] return guaranteed_common_ancestor def get_diff_files(self) -> list[FilePatchInfo]: if self.diff_files: return self.diff_files head_sha = self.pr.fromRef['latestCommit'] # if Bitbucket api version is >= 8.16 then use the merge-base api for 2-way diff calculation if self.bitbucket_api_version is not None and self.bitbucket_api_version >= parse_version("8.16"): try: base_sha = self.bitbucket_client.get(self._get_merge_base())['id'] except Exception as e: get_logger().error(f"Failed to get the 
best common ancestor for PR: {self.pr_url}, \nerror: {e}") raise e else: source_commits_list = list(self.bitbucket_client.get_pull_requests_commits( self.workspace_slug, self.repo_slug, self.pr_num )) # if Bitbucket api version is None or < 7.0 then do a simple diff with a guaranteed common ancestor base_sha = source_commits_list[-1]['parents'][0]['id'] # if Bitbucket api version is 7.0-8.15 then use 2-way diff functionality for the base_sha if self.bitbucket_api_version is not None and self.bitbucket_api_version >= parse_version("7.0"): try: destination_commits = list( self.bitbucket_client.get_commits(self.workspace_slug, self.repo_slug, base_sha, self.pr.toRef['latestCommit'])) base_sha = self.get_best_common_ancestor(source_commits_list, destination_commits, base_sha) except Exception as e: get_logger().error( f"Failed to get the commit list for calculating best common ancestor for PR: {self.pr_url}, \nerror: {e}") raise e diff_files = [] original_file_content_str = "" new_file_content_str = "" changes_original = list(self.bitbucket_client.get_pull_requests_changes(self.workspace_slug, self.repo_slug, self.pr_num)) changes = filter_ignored(changes_original, 'bitbucket_server') for change in changes: file_path = change['path']['toString'] if not is_valid_file(file_path.split("/")[-1]): get_logger().info(f"Skipping a non-code file: {file_path}") continue match change['type']: case 'ADD': edit_type = EDIT_TYPE.ADDED new_file_content_str = self.get_file(file_path, head_sha) new_file_content_str = decode_if_bytes(new_file_content_str) original_file_content_str = "" case 'DELETE': edit_type = EDIT_TYPE.DELETED new_file_content_str = "" original_file_content_str = self.get_file(file_path, base_sha) original_file_content_str = decode_if_bytes(original_file_content_str) case 'RENAME': edit_type = EDIT_TYPE.RENAMED case _: edit_type = EDIT_TYPE.MODIFIED original_file_content_str = self.get_file(file_path, base_sha) original_file_content_str = 
def create_inline_comment(self, body: str, relevant_file: str, relevant_line_in_file: str,
                          absolute_position: int = None):
    """
    Build the dict describing an inline comment (it is not published here).

    Args:
        body: Comment text.
        relevant_file: File path; surrounding backticks are ignored for the lookup.
        relevant_line_in_file: Text of the line the comment refers to.
        absolute_position: Optional line position forwarded to the lookup helper.

    Returns:
        dict(body, path, position) when the lookup returns a position other than -1,
        otherwise an empty dict.
    """
    position, absolute_position = find_line_number_of_relevant_line_in_file(
        self.get_diff_files(),
        relevant_file.strip('`'),
        relevant_line_in_file,
        absolute_position
    )

    if position != -1:
        return dict(body=body, path=relevant_file.strip(), position=absolute_position)

    # The line could not be located, so there is nothing to anchor a comment to.
    if get_settings().config.verbosity_level >= 2:
        get_logger().info(f"Could not find position for {relevant_file} {relevant_line_in_file}")
    return {}
def get_line_link(self, relevant_file: str, relevant_line_start: int, relevant_line_end: int = None) -> str:
    """
    Return a link into the PR diff view for a file, optionally anchored at a line.

    Args:
        relevant_file: File path; it is URL-quoted with `quote_plus` before being appended.
        relevant_line_start: Line number to anchor at, or -1 to link to the file only.
        relevant_line_end: Accepted for interface compatibility; not used.
    """
    file_anchor = f"{self.pr_url}/diff#{quote_plus(relevant_file)}"
    if relevant_line_start == -1:
        return file_anchor
    return f"{file_anchor}?t={relevant_line_start}"
self.publish_inline_comment(comment['body'], comment['line'], comment['path']) else: get_logger().error(f"Could not publish inline comment: {comment}") def get_title(self): return self.pr.title def get_languages(self): return {"yaml": 0} # devops LOL def get_pr_branch(self): return self.pr.fromRef['displayId'] def get_pr_owner_id(self) -> str | None: return self.workspace_slug def get_pr_description_full(self): if hasattr(self.pr, "description"): return self.pr.description else: return None def get_user_id(self): return 0 def get_issue_comments(self): raise NotImplementedError( "Bitbucket provider does not support issue comments yet" ) def add_eyes_reaction(self, issue_comment_id: int, disable_eyes: bool = False) -> Optional[int]: return True def remove_reaction(self, issue_comment_id: int, reaction_id: int) -> bool: return True @staticmethod def _parse_bitbucket_server(url: str) -> str: # pr url format: f"{bitbucket_server}/projects/{project_name}/repos/{repository_name}/pull-requests/{pr_id}" parsed_url = urlparse(url) server_path = parsed_url.path.split("/projects/") if len(server_path) > 1: server_path = server_path[0].strip("/") return f"{parsed_url.scheme}://{parsed_url.netloc}/{server_path}".strip("/") return f"{parsed_url.scheme}://{parsed_url.netloc}" @staticmethod def _parse_pr_url(pr_url: str) -> Tuple[str, str, int]: # pr url format: f"{bitbucket_server}/projects/{project_name}/repos/{repository_name}/pull-requests/{pr_id}" parsed_url = urlparse(pr_url) path_parts = parsed_url.path.strip("/").split("/") try: projects_index = path_parts.index("projects") except ValueError: projects_index = -1 try: users_index = path_parts.index("users") except ValueError: users_index = -1 if projects_index == -1 and users_index == -1: raise ValueError(f"The provided URL '{pr_url}' does not appear to be a Bitbucket PR URL") if projects_index != -1: path_parts = path_parts[projects_index:] else: path_parts = path_parts[users_index:] if len(path_parts) < 6 or path_parts[2] 
def publish_description(self, pr_title: str, description: str):
    """
    Update the title and description of the pull request.

    The payload also carries the PR's current `version` and `reviewers`, both read from
    `self.pr`; the reviewers need to be sent, otherwise they get wiped.

    Args:
        pr_title: New pull request title.
        description: New pull request description.

    Raises:
        Exception: Whatever the client raised, after it has been logged.
    """
    current_pr = self.pr
    update_fields = dict(
        version=current_pr.version,
        description=description,
        title=pr_title,
        reviewers=current_pr.reviewers,  # needs to be sent otherwise gets wiped
    )

    try:
        self.bitbucket_client.update_pull_request(
            self.workspace_slug, self.repo_slug, str(self.pr_num), update_fields)
    except Exception as e:
        get_logger().error(f"Failed to update pull request, error: {e}")
        raise e
def _clone_inner(self, repo_url: str, dest_folder: str, operation_timeout_in_seconds: int=None):
    """
    Clone `repo_url` into `dest_folder`, authenticating with an HTTP bearer header.

    Overrides the default clone command, since for some reason usage of x-token-auth doesn't
    work with Bitbucket Server, as mentioned here:
    https://stackoverflow.com/questions/56760396/cloning-bitbucket-server-repo-with-access-tokens

    Args:
        repo_url: Repository URL to clone.
        dest_folder: Target directory for the clone.
        operation_timeout_in_seconds: Timeout handed to `subprocess.run`; None means no timeout.

    Raises:
        RuntimeError: If no bearer token is configured.
        subprocess.CalledProcessError: If the git command fails (because of `check=True`).
        subprocess.TimeoutExpired: If the timeout expires.
    """
    bearer_token = self.bearer_token
    if not bearer_token:
        # Shouldn't happen since this is checked in _prepare_clone, therefore - throwing an exception.
        raise RuntimeError("Bearer token is required!")

    # Build the argument vector directly instead of formatting a command string and
    # re-parsing it with shlex: a URL, folder or token containing spaces or quotes
    # would otherwise be split into the wrong arguments.
    cli_args = [
        "git", "clone",
        "-c", f"http.extraHeader=Authorization: Bearer {bearer_token}",
        "--filter=blob:none",
        "--depth", "1",
        repo_url,
        dest_folder,
    ]

    ssl_env = get_git_ssl_env()

    subprocess.run(cli_args, env=ssl_env, check=True,  # check=True will raise an exception if the command fails
                   stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL, timeout=operation_timeout_in_seconds)
def get_differences(self, repo_name: str, destination_commit: str, source_commit: str):
    """
    Get the differences between two commits in CodeCommit.

    Args:
    - repo_name: Name of the repository
    - destination_commit: Commit hash you want to merge into (the "before" hash) (usually on the main or master branch)
    - source_commit: Commit hash of the code you are adding (the "after" branch)

    Returns:
    - List of CodeCommitDifferencesResponse objects

    Raises:
    - ValueError: if the repository does not exist or the differences cannot be retrieved

    Boto3 Documentation:
    - aws codecommit get-differences
    - https://boto3.amazonaws.com/v1/documentation/api/latest/reference/services/codecommit/client/get_differences.html
    """
    if self.boto_client is None:
        self._connect_boto_client()

    # The differences response from AWS is paginated, so we need to iterate through the pages to get all the differences.
    differences = []
    try:
        paginator = self.boto_client.get_paginator("get_differences")
        for page in paginator.paginate(
            repositoryName=repo_name,
            beforeCommitSpecifier=destination_commit,
            afterCommitSpecifier=source_commit,
        ):
            differences.extend(page.get("differences", []))
    except botocore.exceptions.ClientError as e:
        if e.response["Error"]["Code"] == 'RepositoryDoesNotExistException':
            raise ValueError(f"CodeCommit cannot retrieve differences: Repository does not exist: {repo_name}") from e
        raise ValueError(f"CodeCommit cannot retrieve differences for {source_commit}..{destination_commit}") from e
    except Exception as e:
        raise ValueError(f"CodeCommit cannot retrieve differences for {source_commit}..{destination_commit}") from e

    # Wrap every raw difference dict in its response object.
    return [CodeCommitDifferencesResponse(difference) for difference in differences]
def get_pr(self, repo_name: str, pr_number: int):
    """
    Get information about a CodeCommit PR.

    Args:
    - repo_name: Name of the repository (only used in error messages; the AWS call is keyed by PR number)
    - pr_number: The PR number you are requesting

    Returns:
    - CodeCommitPullRequestResponse object

    Raises:
    - ValueError: if the PR or repository does not exist, the AWS call fails, or the response has no "pullRequest" entry

    Boto3 Documentation:
    - aws codecommit get_pull_request
    - https://boto3.amazonaws.com/v1/documentation/api/latest/reference/services/codecommit/client/get_pull_request.html
    """
    # The boto3 client is created lazily on first use.
    if self.boto_client is None:
        self._connect_boto_client()

    try:
        # The CodeCommit API expects the pull request ID as a string.
        response = self.boto_client.get_pull_request(pullRequestId=str(pr_number))
    except botocore.exceptions.ClientError as e:
        error_code = e.response["Error"]["Code"]
        if error_code == 'PullRequestDoesNotExistException':
            raise ValueError(f"CodeCommit cannot retrieve PR: PR number does not exist: {pr_number}") from e
        if error_code == 'RepositoryDoesNotExistException':
            raise ValueError(f"CodeCommit cannot retrieve PR: Repository does not exist: {repo_name}") from e
        raise ValueError(f"CodeCommit cannot retrieve PR: {pr_number}: boto client error") from e
    except Exception as e:
        raise ValueError(f"CodeCommit cannot retrieve PR: {pr_number}") from e

    if "pullRequest" not in response:
        # This must be an f-string so the message names the actual PR number.
        raise ValueError(f"CodeCommit PR number not found: {pr_number}")

    return CodeCommitPullRequestResponse(response["pullRequest"])
def publish_comment(self, repo_name: str, pr_number: int, destination_commit: str, source_commit: str, comment: str, annotation_file: str = None, annotation_line: int = None):
    """
    Post a comment on a pull request, optionally anchored to a file and line.

    Args:
    - repo_name: name of the repository
    - pr_number: number of the pull request
    - destination_commit: The commit hash you want to merge into (the "before" hash) (usually on the main or master branch)
    - source_commit: The commit hash of the code you are adding (the "after" branch)
    - comment: The comment you want to publish
    - annotation_file: The file you want to annotate (optional)
    - annotation_line: The line number you want to annotate (optional)

    CodeCommit annotations differ from GitHub's: only a starting line can be given,
    there is no end line, so a range of lines cannot be highlighted.

    Returns:
    - None

    Raises:
    - ValueError: if the repository or PR does not exist, or the AWS call fails for any other reason

    Boto3 Documentation:
    - aws codecommit post_comment_for_pull_request
    - https://boto3.amazonaws.com/v1/documentation/api/latest/reference/services/codecommit/client/post_comment_for_pull_request.html
    """
    if self.boto_client is None:
        self._connect_boto_client()

    # Arguments common to both the plain and the annotated comment.
    request = {
        "pullRequestId": str(pr_number),
        "repositoryName": repo_name,
        "beforeCommitId": destination_commit,
        "afterCommitId": source_commit,
        "content": comment,
    }

    # Only attach a location when both the file and the line were supplied.
    if annotation_file and annotation_line:
        request["location"] = {
            "filePath": annotation_file,
            "filePosition": annotation_line,
            "relativeFileVersion": "AFTER",
        }

    try:
        self.boto_client.post_comment_for_pull_request(**request)
    except botocore.exceptions.ClientError as e:
        error_code = e.response["Error"]["Code"]
        if error_code == 'RepositoryDoesNotExistException':
            raise ValueError(f"Repository does not exist: {repo_name}") from e
        if error_code == 'PullRequestDoesNotExistException':
            raise ValueError(f"PR number does not exist: {pr_number}") from e
        raise ValueError("Boto3 client error calling post_comment_for_pull_request") from e
    except Exception as e:
        raise ValueError("Error calling post_comment_for_pull_request") from e
from ..log import get_logger from .git_provider import GitProvider class PullRequestCCMimic: """ This class mimics the PullRequest class from the PyGithub library for the CodeCommitProvider. """ def __init__(self, title: str, diff_files: List[FilePatchInfo]): self.title = title self.diff_files = diff_files self.description = None self.source_commit = None self.source_branch = None # the branch containing your new code changes self.destination_commit = None self.destination_branch = None # the branch you are going to merge into class CodeCommitFile: """ This class represents a file in a pull request in CodeCommit. """ def __init__( self, a_path: str, a_blob_id: str, b_path: str, b_blob_id: str, edit_type: EDIT_TYPE, ): self.a_path = a_path self.a_blob_id = a_blob_id self.b_path = b_path self.b_blob_id = b_blob_id self.edit_type: EDIT_TYPE = edit_type self.filename = b_path if b_path else a_path class CodeCommitProvider(GitProvider): """ This class implements the GitProvider interface for AWS CodeCommit repositories. 
def get_files(self) -> "list[CodeCommitFile]":
    """
    Return the files changed in the pull request, fetching them from CodeCommit only once.

    Returns:
    - list of CodeCommitFile objects, one per difference between the PR's destination and source commits
    """
    # Compare against None rather than truthiness: a PR with no differences yields
    # an empty list, which is a valid cached result and must not trigger a refetch.
    if self.git_files is not None:
        return self.git_files

    differences = self.codecommit_client.get_differences(
        self.repo_name, self.pr.destination_commit, self.pr.source_commit)

    # Assign the cache only after the fetch succeeded, so a failed call is retried next time.
    self.git_files = [
        CodeCommitFile(
            item.before_blob_path,
            item.before_blob_id,
            item.after_blob_path,
            item.after_blob_id,
            CodeCommitProvider._get_edit_type(item.change_type),
        )
        for item in differences
    ]
    return self.git_files
""" # bring files from CodeCommit only once if self.diff_files: return self.diff_files self.diff_files = [] files = self.get_files() for diff_item in files: patch_filename = "" if diff_item.a_blob_id is not None: patch_filename = diff_item.a_path original_file_content_str = self.codecommit_client.get_file( self.repo_name, diff_item.a_path, self.pr.destination_commit) if isinstance(original_file_content_str, (bytes, bytearray)): original_file_content_str = original_file_content_str.decode("utf-8") else: original_file_content_str = "" if diff_item.b_blob_id is not None: patch_filename = diff_item.b_path new_file_content_str = self.codecommit_client.get_file(self.repo_name, diff_item.b_path, self.pr.source_commit) if isinstance(new_file_content_str, (bytes, bytearray)): new_file_content_str = new_file_content_str.decode("utf-8") else: new_file_content_str = "" patch = load_large_diff(patch_filename, new_file_content_str, original_file_content_str) # Store the diffs as a list of FilePatchInfo objects info = FilePatchInfo( original_file_content_str, new_file_content_str, patch, diff_item.b_path, edit_type=diff_item.edit_type, old_filename=None if diff_item.a_path == diff_item.b_path else diff_item.a_path, ) # Only add valid files to the diff list # "bad extensions" are set in the language_extensions.toml file # a "valid file" is one that is not in the "bad extensions" list if is_valid_file(info.filename): self.diff_files.append(info) return self.diff_files def publish_description(self, pr_title: str, pr_body: str): try: self.codecommit_client.publish_description( pr_number=self.pr_num, pr_title=pr_title, pr_body=CodeCommitProvider._add_additional_newlines(pr_body), ) except Exception as e: raise ValueError(f"CodeCommit Cannot publish description for PR: {self.pr_num}") from e def publish_comment(self, pr_comment: str, is_temporary: bool = False): if is_temporary: get_logger().info(pr_comment) return pr_comment = CodeCommitProvider._remove_markdown_html(pr_comment) 
pr_comment = CodeCommitProvider._add_additional_newlines(pr_comment) try: self.codecommit_client.publish_comment( repo_name=self.repo_name, pr_number=self.pr_num, destination_commit=self.pr.destination_commit, source_commit=self.pr.source_commit, comment=pr_comment, ) except Exception as e: raise ValueError(f"CodeCommit Cannot publish comment for PR: {self.pr_num}") from e def publish_code_suggestions(self, code_suggestions: list) -> bool: counter = 1 for suggestion in code_suggestions: # Verify that each suggestion has the required keys if not all(key in suggestion for key in ["body", "relevant_file", "relevant_lines_start"]): get_logger().warning(f"Skipping code suggestion #{counter}: Each suggestion must have 'body', 'relevant_file', 'relevant_lines_start' keys") continue # Publish the code suggestion to CodeCommit try: get_logger().debug(f"Code Suggestion #{counter} in file: {suggestion['relevant_file']}: {suggestion['relevant_lines_start']}") self.codecommit_client.publish_comment( repo_name=self.repo_name, pr_number=self.pr_num, destination_commit=self.pr.destination_commit, source_commit=self.pr.source_commit, comment=suggestion["body"], annotation_file=suggestion["relevant_file"], annotation_line=suggestion["relevant_lines_start"], ) except Exception as e: raise ValueError(f"CodeCommit Cannot publish code suggestions for PR: {self.pr_num}") from e counter += 1 # The calling function passes in a list of code suggestions, and this function publishes each suggestion one at a time. # If we were to return False here, the calling function will attempt to publish the same list of code suggestions again, one at a time. # Since this function publishes the suggestions one at a time anyway, we always return True here to avoid the retry. 
def get_languages(self):
    """
    Returns a dictionary of languages, containing the percentage of each language used in the PR.

    Returns:
    - dict: A dictionary where each key is a language name and the corresponding value is the percentage of that language in the PR.
      Files whose extension is not in the language map are accumulated under the empty-string key.
    """
    commit_files = self.get_files()
    filenames = [item.filename for item in commit_files]
    file_extensions = CodeCommitProvider._get_file_extensions(filenames)

    # Calculate the percentage of each file extension in the PR
    percentages = CodeCommitProvider._get_language_percentages(file_extensions)

    # The global language_extension_map is a dictionary of languages,
    # where each dictionary item is a BoxList of extensions.
    # We want a dictionary of extensions,
    # where each dictionary item is a language name.
    # We build that extension->language dictionary here in main_extensions_flat.
    main_extensions_flat = {}
    language_extension_map_org = get_settings().language_extension_map_org
    language_extension_map = {k.lower(): v for k, v in language_extension_map_org.items()}
    for language, language_extensions in language_extension_map.items():
        for ext in language_extensions:
            main_extensions_flat[ext] = language

    # Map the file extension/languages to percentages.
    # Several extensions can belong to the same language, so accumulate
    # instead of letting the last extension overwrite the earlier ones.
    languages = {}
    for ext, pct in percentages.items():
        language = main_extensions_flat.get(ext, "")
        languages[language] = languages.get(language, 0) + pct

    return languages
@staticmethod
def _is_valid_codecommit_hostname(hostname: str) -> bool:
    """
    Check if the provided hostname is a valid AWS CodeCommit hostname.

    This is not an exhaustive check of AWS region names,
    but instead uses a regex to check for matching AWS region patterns.

    Args:
    - hostname: the hostname to check

    Returns:
    - bool: True if the hostname is valid, False otherwise.
    """
    # fullmatch anchors both ends strictly; "$" with re.match would also accept
    # a hostname followed by a trailing newline.
    return re.fullmatch(r"[a-z]{2}-(gov-)?[a-z]+-\d\.console\.aws\.amazon\.com", hostname) is not None
) # Return our object that mimics PullRequest class from the PyGithub library # (This strategy was copied from the LocalGitProvider) mimic = PullRequestCCMimic(response.title, self.diff_files) mimic.description = response.description mimic.source_commit = response.targets[0].source_commit mimic.source_branch = response.targets[0].source_branch mimic.destination_commit = response.targets[0].destination_commit mimic.destination_branch = response.targets[0].destination_branch return mimic def get_commit_messages(self): return "" # not implemented yet @staticmethod def _add_additional_newlines(body: str) -> str: """ Replace single newlines in a PR body with double newlines. CodeCommit Markdown does not seem to render as well as GitHub Markdown, so we add additional newlines to the PR body to make it more readable in CodeCommit. Args: - body: the PR body Returns: - str: the PR body with the double newlines added """ return re.sub(r'(? str: """ Remove the HTML tags from a PR comment. CodeCommit Markdown does not seem to render as well as GitHub Markdown, so we remove the HTML tags from the PR comment to make it more readable in CodeCommit. Args: - comment: the PR comment Returns: - str: the PR comment with the HTML tags removed """ comment = comment.replace("
      ", "") comment = comment.replace("
      ", "") comment = comment.replace("", "") comment = comment.replace("", "") return comment @staticmethod def _get_edit_type(codecommit_change_type: str): """ Convert the CodeCommit change type string to the EDIT_TYPE enum. The CodeCommit change type string is returned from the get_differences SDK method. Args: - codecommit_change_type: the CodeCommit change type string Returns: - An EDIT_TYPE enum representing the modified, added, deleted, or renamed file in the PR diff. """ t = codecommit_change_type.upper() edit_type = None if t == "A": edit_type = EDIT_TYPE.ADDED elif t == "D": edit_type = EDIT_TYPE.DELETED elif t == "M": edit_type = EDIT_TYPE.MODIFIED elif t == "R": edit_type = EDIT_TYPE.RENAMED return edit_type @staticmethod def _get_file_extensions(filenames): """ Return a list of file extensions from a list of filenames. The returned extensions will include the dot "." prefix, to accommodate for the dots in the existing language_extension_map settings. Filenames with no extension will return an empty string for the extension. Args: - filenames: a list of filenames Returns: - list: A list of file extensions, including the dot "." prefix. """ extensions = [] for filename in filenames: filename, ext = os.path.splitext(filename) if ext: extensions.append(ext.lower()) else: extensions.append("") return extensions @staticmethod def _get_language_percentages(extensions): """ Return a dictionary containing the programming language name (as the key), and the percentage that language is used (as the value), given a list of file extensions. Args: - extensions: a list of file extensions Returns: - dict: A dictionary where each key is a language name and the corresponding value is the percentage of that language in the PR. 
def _call(*command, **kwargs) -> str:
    """
    Run a command and return its standard output as text.

    Args:
    - command: the program and its arguments, passed as separate positional values
    - kwargs: extra keyword arguments forwarded to subprocess.run (for example cwd)

    Returns:
    - str: the decoded standard output of the command.
      Standard error is captured but not returned.

    Raises:
    - subprocess.CalledProcessError: if the command exits with a non-zero status (check=True)
    """
    res = subprocess.run(
        command,
        stdout=subprocess.PIPE,
        stderr=subprocess.PIPE,
        check=True,
        **kwargs,
    )
    return res.stdout.decode()
'checkout', "--force", cwd=cwd) def add_comment(url: urllib3.util.Url, refspec, message): *_, patchset, changenum = refspec.rsplit("/") message = "'" + message.replace("'", "'\"'\"'") + "'" return _call( "ssh", "-p", str(url.port), f"{url.auth}@{url.host}", "gerrit", "review", "--message", message, # "--code-review", score, f"{patchset},{changenum}", ) def list_comments(url: urllib3.util.Url, refspec): *_, patchset, _ = refspec.rsplit("/") stdout = _call( "ssh", "-p", str(url.port), f"{url.auth}@{url.host}", "gerrit", "query", "--comments", "--current-patch-set", patchset, "--format", "JSON", ) change_set, *_ = stdout.splitlines() return json.loads(change_set)["currentPatchSet"]["comments"] def prepare_repo(url: urllib3.util.Url, project, refspec): repo_url = (f"{url.scheme}://{url.auth}@{url.host}:{url.port}/{project}") directory = pathlib.Path(mkdtemp()) clone(repo_url, directory) fetch(repo_url, refspec, cwd=directory) checkout(cwd=directory) return directory def adopt_to_gerrit_message(message): lines = message.splitlines() buf = [] for line in lines: # remove markdown formatting line = (line.replace("*", "") .replace("``", "`") .replace("
      ", "") .replace("
      ", "") .replace("", "") .replace("", "")) line = line.strip() if line.startswith('#'): buf.append("\n" + line.replace('#', '').removesuffix(":").strip() + ":") continue elif line.startswith('-'): buf.append(line.removeprefix('-').strip()) continue else: buf.append(line) return "\n".join(buf).strip() def add_suggestion(src_filename, context: str, start, end: int): with ( NamedTemporaryFile("w", delete=False) as tmp, open(src_filename, "r") as src ): lines = src.readlines() tmp.writelines(lines[:start - 1]) if context: tmp.write(context) tmp.writelines(lines[end:]) shutil.copy(tmp.name, src_filename) os.remove(tmp.name) def upload_patch(patch, path): patch_server_endpoint = get_settings().get( 'gerrit.patch_server_endpoint') patch_server_token = get_settings().get( 'gerrit.patch_server_token') response = requests.post( patch_server_endpoint, json={ "content": patch, "path": path, }, headers={ "Content-Type": "application/json", "Authorization": f"Bearer {patch_server_token}", } ) response.raise_for_status() patch_server_endpoint = patch_server_endpoint.rstrip("/") return patch_server_endpoint + "/" + path class GerritProvider(GitProvider): def __init__(self, key: str, incremental=False): self.project, self.refspec = key.split(':') assert self.project, "Project name is required" assert self.refspec, "Refspec is required" base_url = get_settings().get('gerrit.url') assert base_url, "Gerrit URL is required" user = get_settings().get('gerrit.user') assert user, "Gerrit user is required" parsed = urllib3.util.parse_url(base_url) self.parsed_url = urllib3.util.parse_url( f"{parsed.scheme}://{user}@{parsed.host}:{parsed.port}" ) self.repo_path = prepare_repo( self.parsed_url, self.project, self.refspec ) self.repo = Repo(self.repo_path) assert self.repo self.pr_url = base_url self.pr = PullRequestMimic(self.get_pr_title(), self.get_diff_files()) def get_pr_title(self): """ Substitutes the branch-name as the PR-mimic title. 
def get_repo_settings(self):
    """
    Read the optional ".pr_agent.toml" settings file from the cloned repository.

    Returns:
    - bytes: the raw file contents, or b"" when the file cannot be opened or read
    """
    settings_path = self.repo_path / ".pr_agent.toml"
    try:
        return settings_path.read_bytes()
    except OSError:
        # A missing or unreadable settings file simply means "no repo settings".
        return b""
def split_suggestion(self, msg) -> tuple[str, str]:
    """
    Separate a suggestion message into its description and its suggested code.

    Lines between a "```suggestion" fence and the next "```" fence are the code;
    every other non-fence line belongs to the description, with "*" characters removed.
    Fence lines themselves are dropped.

    Returns:
    - tuple: (description, code). The code ends with a newline when present,
      and is an empty string when the message has no suggestion block.
    """
    prose_lines = []
    code_lines = []
    inside_suggestion = False

    for raw_line in msg.splitlines():
        if raw_line.startswith('```'):
            # Any fence flips the state: only a suggestion fence opens a code section.
            inside_suggestion = raw_line.startswith('```suggestion')
            continue

        if inside_suggestion:
            code_lines.append(raw_line)
        else:
            prose_lines.append(raw_line.replace('*', ''))

    description_text = '\n'.join(prose_lines)
    if code_lines:
        code_text = '\n'.join(code_lines) + '\n'
    else:
        code_text = ''
    return (description_text, code_text)
def get_git_ssl_env() -> dict[str, str]:
    """
    Build the environment to pass to git subprocesses, including CA bundle settings.

    The first *defined* variable among SSL_CERT_FILE, REQUESTS_CA_BUNDLE and
    GIT_SSL_CAINFO (in that priority order) is considered. If the file it names
    exists, both GIT_SSL_CAINFO and REQUESTS_CA_BUNDLE are set to that path in
    the returned mapping. If it does not exist, a warning is logged and the
    lower-priority variables are NOT tried.

    Returns:
        A copy of ``os.environ``, updated with the chosen certificate file if any.
    """
    from_ssl_cert_file = os.environ.get('SSL_CERT_FILE')
    from_requests = os.environ.get('REQUESTS_CA_BUNDLE')
    from_git = os.environ.get('GIT_SSL_CAINFO')
    selected_bundle = ""

    if from_ssl_cert_file:
        # Highest priority: SSL_CERT_FILE
        if not os.path.exists(from_ssl_cert_file):
            get_logger().warning("SSL certificate bundle not found for git operations",
                                 artifact={"ssl_cert_file": from_ssl_cert_file})
        else:
            conflicting = any(other and other != from_ssl_cert_file
                              for other in (from_requests, from_git))
            if conflicting:
                get_logger().warning("Found mismatch among: SSL_CERT_FILE, REQUESTS_CA_BUNDLE, GIT_SSL_CAINFO. "
                                     "Using the SSL_CERT_FILE to resolve ambiguity.",
                                     artifact={"ssl_cert_file": from_ssl_cert_file,
                                               "requests_ca_bundle": from_requests,
                                               'git_ssl_ca_info': from_git})
            else:
                get_logger().info("Using SSL certificate bundle for git operations",
                                  artifact={"ssl_cert_file": from_ssl_cert_file})
            selected_bundle = from_ssl_cert_file
    elif from_requests:
        # Second priority: REQUESTS_CA_BUNDLE
        if not os.path.exists(from_requests):
            get_logger().warning("requests CA bundle not found for git operations",
                                 artifact={"requests_ca_bundle": from_requests})
        else:
            if from_git and from_git != from_requests:
                get_logger().warning("Found mismatch between: REQUESTS_CA_BUNDLE, GIT_SSL_CAINFO. "
                                     "Using the REQUESTS_CA_BUNDLE to resolve ambiguity.",
                                     artifact={"requests_ca_bundle": from_requests,
                                               'git_ssl_ca_info': from_git})
            else:
                get_logger().info("Using SSL certificate bundle from REQUESTS_CA_BUNDLE for git operations",
                                  artifact={"requests_ca_bundle": from_requests})
            selected_bundle = from_requests
    elif from_git:
        # Last priority: GIT_SSL_CAINFO
        if not os.path.exists(from_git):
            get_logger().warning("git SSL CA info not found for git operations",
                                 artifact={"git_ssl_ca_info": from_git})
        else:
            get_logger().info("Using git SSL CA info from GIT_SSL_CAINFO for git operations",
                              artifact={"git_ssl_ca_info": from_git})
            selected_bundle = from_git
    else:
        get_logger().warning("Neither SSL_CERT_FILE nor REQUESTS_CA_BUNDLE nor GIT_SSL_CAINFO are defined, or they are defined but not found. Returning environment without SSL configuration")

    git_env = os.environ.copy()
    if selected_bundle:
        git_env["GIT_SSL_CAINFO"] = selected_bundle
        git_env["REQUESTS_CA_BUNDLE"] = selected_bundle
    return git_env
class ScopedClonedRepo(object):
    """
    Wrapper that owns a cloned repository folder and deletes it when released.

    The folder is removed when the object is finalized (``__del__``), exactly as
    before. Because finalizers run at a time chosen by the interpreter, the
    folder can additionally be released deterministically, either by calling
    ``cleanup()`` or by using the object in a ``with`` statement:

        with provider.clone(url, dest) as cloned:
            use(cloned.path)
        # cloned.path has been deleted here

    Attributes are documented inline.
    """

    def __init__(self, dest_folder):
        # Folder holding the clone; kept unchanged after cleanup so callers can still read it.
        self.path = dest_folder

    def cleanup(self) -> None:
        """Delete the cloned folder if it still exists. Safe to call more than once."""
        # getattr guards against finalization of an object whose __init__ never completed.
        path = getattr(self, "path", None)
        if path and os.path.exists(path):
            shutil.rmtree(path, ignore_errors=True)

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        self.cleanup()
        return False  # never suppress an exception raised inside the with-block

    def __del__(self):
        self.cleanup()
def clone(self, repo_url_to_clone: str, dest_folder: str, remove_dest_folder: bool = True,
          operation_timeout_in_seconds: int=CLONE_TIMEOUT_SEC) -> ScopedClonedRepo|None:
    """
    Clone a given url to a destination folder.

    Args:
        repo_url_to_clone: Repository url as supplied by the caller; it is passed to
            ``_prepare_clone_url_with_token`` to obtain the url that is actually cloned.
        dest_folder: Folder to clone into.
        remove_dest_folder: When True, an existing directory at ``dest_folder`` is
            deleted before cloning.
        operation_timeout_in_seconds: Timeout handed to ``_clone_inner``.

    Returns:
        A ``GitProvider.ScopedClonedRepo`` wrapping ``dest_folder`` on success, or None
        when no clone url could be obtained or the clone raised an ``Exception``.
        Exceptions that do not derive from ``Exception`` (e.g. ``KeyboardInterrupt``)
        propagate to the caller.
    """
    clone_url = self._prepare_clone_url_with_token(repo_url_to_clone)
    if not clone_url:
        get_logger().error("Clone failed: Unable to obtain url to clone.")
        return None

    try:
        if remove_dest_folder and os.path.exists(dest_folder) and os.path.isdir(dest_folder):
            shutil.rmtree(dest_folder)
        self._clone_inner(clone_url, dest_folder, operation_timeout_in_seconds)
        return GitProvider.ScopedClonedRepo(dest_folder)
    except Exception as e:
        # The prepared clone url may embed a token, so only the caller-supplied url is
        # logged and the prepared url is scrubbed from the error text.
        # NOTE(review): the traceback emitted by .exception() can still contain the git
        # command line - confirm whether the logger sanitizes it.
        error_text = str(e).replace(clone_url, repo_url_to_clone)
        get_logger().exception("Clone failed: Could not clone url.",
                               artifact={"error": error_text, "url": repo_url_to_clone,
                                         "dest_folder": dest_folder})
        return None
def get_user_description(self) -> str:
    """
    Return the description written by the user, stripped of pr-agent generated content.

    The result is cached on ``self.user_description`` (except when a generated
    description contains no user section, in which case "" is returned uncached).

    Returns:
        - the cached value, if one was stored by a previous call;
        - the full description, if it does not start with one of ``_possible_headers()``;
        - "" if it was generated by the pr-agent but has no "### **user description**" header;
        - otherwise the text between that header and the next known header that follows it
          (falling back to the text before the first "___" separator when no such header exists).
    """
    cached_description = getattr(self, 'user_description', None)
    if cached_description is not None:
        return cached_description

    description = (self.get_pr_description_full() or "").strip()
    description_lowercase = description.lower()
    get_logger().debug("Existing description", description=description_lowercase)

    # if the existing description wasn't generated by the pr-agent, just return it as-is
    if not self._is_generated_by_pr_agent(description_lowercase):
        get_logger().info("Existing description was not generated by the pr-agent")
        self.user_description = description
        return description

    # if the existing description was generated by the pr-agent, but it doesn't contain a user description,
    # return nothing (empty string) because it means there is no user description
    user_description_header = "### **user description**"
    header_position = description_lowercase.find(user_description_header)
    if header_position == -1:
        get_logger().info("Existing description was generated by the pr-agent, but it doesn't contain a user description")
        return ""

    start_position = header_position + len(user_description_header)
    end_position = len(description)
    for header in self._possible_headers():
        if header == user_description_header:
            continue
        # Search only AFTER the user description header: a header located before it
        # cannot terminate the user section and must not drag end_position below start.
        next_header_position = description_lowercase.find(header, start_position)
        if next_header_position != -1:
            end_position = min(end_position, next_header_position)

    if end_position != len(description) and end_position > start_position:
        original_user_description = description[start_position:end_position].strip()
        if original_user_description.endswith("___"):
            original_user_description = original_user_description[:-3].strip()
    else:
        # No following header: take everything up to the first horizontal rule.
        original_user_description = description.split("___")[0].strip()
        if original_user_description.lower().startswith(user_description_header):
            original_user_description = original_user_description[len(user_description_header):].strip()

    get_logger().info("Extracted user description from existing description",
                      description=original_user_description)
    self.user_description = original_user_description
    return original_user_description
def publish_persistent_comment_full(self,
                                    pr_comment: str,
                                    initial_header: str,
                                    update_header: bool = True,
                                    name='review',
                                    final_update_message=True):
    """
    Update the first existing comment that starts with ``initial_header``, or publish a new one.

    Args:
        pr_comment: Text to publish.
        initial_header: Prefix identifying the persistent comment among existing comments.
        update_header: When True, ``initial_header`` inside ``pr_comment`` is extended with an
            "updated until commit" line before the existing comment is edited.
        name: Label used in log and notification messages.
        final_update_message: When True, a short notification comment is published after the
            edit and its result is returned; otherwise the edited comment object is returned.

    Returns:
        The result of ``publish_comment`` or the edited comment, as described above. If no
        comment matches, or any exception is raised while updating, ``pr_comment`` is
        published as a regular comment instead.
    """
    try:
        existing_comments = list(self.get_issue_comments())
        persistent = next(
            (candidate for candidate in existing_comments if candidate.body.startswith(initial_header)),
            None,
        )
        if persistent is not None:
            latest_commit_url = self.get_latest_commit_url()
            comment_url = self.get_comment_url(persistent)

            refreshed_text = pr_comment
            if update_header:
                refreshed_header = f"{initial_header}\n\n#### ({name.capitalize()} updated until commit {latest_commit_url})\n"
                refreshed_text = pr_comment.replace(initial_header, refreshed_header)

            get_logger().info(f"Persistent mode - updating comment {comment_url} to latest {name} message")
            self.edit_comment(persistent, refreshed_text)

            if not final_update_message:
                return persistent
            return self.publish_comment(
                f"**[Persistent {name}]({comment_url})** updated to latest commit {latest_commit_url}")
    except Exception as e:
        # Best effort: on any failure fall through and publish a fresh comment.
        get_logger().exception(f"Failed to update persistent review, error: {e}")
    return self.publish_comment(pr_comment)
def get_main_pr_language(languages, files) -> str:
    """
    Get the main language of the commit. Return an empty string if cannot determine.

    Args:
        languages: Mapping of language name to a comparable weight; the key with the
            largest value is treated as the repository's top language.
        files: Iterable of file names (str) or objects exposing a ``filename`` attribute.
            Falsy entries are skipped.

    Returns:
        The top language (lower-cased) when the most common file extension belongs to it
        according to the ``language_extension_map_org`` setting; otherwise the first
        language in that map listing the extension; otherwise "".
    """
    from collections import Counter

    main_language_str = ""
    if not languages:
        get_logger().info("No languages detected")
        return main_language_str
    if not files:
        get_logger().info("No files in diff")
        return main_language_str

    try:
        top_language = max(languages, key=languages.get).lower()

        # validate that the specific commit uses the main language
        extension_counts = Counter()
        for file in files:
            if not file:
                continue
            filename = file if isinstance(file, str) else file.filename
            extension_counts[filename.rsplit('.', 1)[-1]] += 1

        if not extension_counts:
            # Every entry was falsy: nothing to infer a language from.
            get_logger().info("No valid files in diff")
            return main_language_str

        # most_common() breaks ties by first occurrence, so the result is deterministic.
        most_common_extension = '.' + extension_counts.most_common(1)[0][0]
        try:
            language_extension_map_org = get_settings().language_extension_map_org
            language_extension_map = {k.lower(): v for k, v in language_extension_map_org.items()}

            if top_language in language_extension_map and most_common_extension in language_extension_map[top_language]:
                main_language_str = top_language
            else:
                for language, extensions in language_extension_map.items():
                    if most_common_extension in extensions:
                        main_language_str = language
                        break
        except Exception as e:
            get_logger().exception(f"Failed to get main language: {e}")
    except Exception as e:
        get_logger().exception(e)

    return main_language_str
self.gitea_access_token = get_settings().get("GITEA.PERSONAL_ACCESS_TOKEN", None) if not self.gitea_access_token: self.logger.error("Gitea access token not found in settings.") raise ValueError("Gitea access token not found in settings.") self.repo_settings = get_settings().get("GITEA.REPO_SETTING", None) configuration = giteapy.Configuration() configuration.host = "{}/api/v1".format(self.base_url) configuration.api_key['Authorization'] = f'token {self.gitea_access_token}' if get_settings().get("GITEA.SKIP_SSL_VERIFICATION", False): configuration.verify_ssl = False # Use custom cert (self-signed) configuration.ssl_ca_cert = get_settings().get("GITEA.SSL_CA_CERT", None) client = giteapy.ApiClient(configuration) self.repo_api = RepoApi(client) self.owner = None self.repo = None self.pr_number = None self.issue_number = None self.max_comment_chars = 65000 self.enabled_pr = False self.enabled_issue = False self.temp_comments = [] self.pr = None self.git_files = [] self.file_contents = {} self.file_diffs = {} self.sha = None self.diff_files = [] self.incremental = IncrementalPR(False) self.comments_list = [] self.unreviewed_files_set = dict() if "pulls" in url: self.pr_url = url self.__set_repo_and_owner_from_pr() self.enabled_pr = True self.pr = self.repo_api.get_pull_request( owner=self.owner, repo=self.repo, pr_number=self.pr_number ) self.git_files = self.repo_api.get_change_file_pull_request( owner=self.owner, repo=self.repo, pr_number=self.pr_number ) # Optional ignore with user custom self.git_files = filter_ignored(self.git_files, platform="gitea") self.sha = self.pr.head.sha if self.pr.head.sha else "" self.__add_file_content() self.__add_file_diff() self.pr_commits = self.repo_api.list_all_commits( owner=self.owner, repo=self.repo ) self.last_commit = self.pr_commits[-1] self.last_commit_id = self.last_commit self.base_sha = self.pr.base.sha if self.pr.base.sha else "" self.base_ref = self.pr.base.ref if self.pr.base.ref else "" elif "issues" in url: 
def __add_file_diff(self):
    """
    Fetch the pull request diff and store one patch string per file in ``self.file_diffs``.

    The diff text returned by ``repo_api.get_pull_request_diff`` is split on
    ``diff --git`` lines. For each file, everything from its first ``@@`` hunk header
    onward is kept (all hunks, joined with newlines); the lines before the first hunk
    header are dropped. Files without any hunk get no entry. The file name is
    the text after the last " b/" of the ``diff --git`` line.

    Any exception is logged via ``self.logger.error`` and leaves ``self.file_diffs`` untouched.
    """
    try:
        diff_contents = self.repo_api.get_pull_request_diff(
            owner=self.owner,
            repo=self.repo,
            pr_number=self.pr_number
        )

        file_patches = {}
        current_file = None
        current_patch = []

        for line in diff_contents.splitlines():
            if line.startswith('diff --git'):
                # A new file starts: flush what was collected for the previous one.
                if current_file and current_patch:
                    file_patches[current_file] = '\n'.join(current_patch)
                current_patch = []
                current_file = line.split(' b/')[-1]
            elif line.startswith('@@') or current_patch:
                # Hunk headers are appended rather than restarting the patch, so that
                # files with several hunks keep all of them.
                current_patch.append(line)

        if current_file and current_patch:
            file_patches[current_file] = '\n'.join(current_patch)

        self.file_diffs = file_patches
    except Exception as e:
        self.logger.error(f"Error getting diff content: {str(e)}")
def publish_comment(self, comment: str, is_temporary: bool = False) -> Optional[Dict[str, Any]]:
    """
    Publish a comment to the pull request or issue this provider was created for.

    Args:
        comment: Comment text; it is truncated with ``limit_output_characters`` to
            ``self.max_comment_chars`` before being sent.
        is_temporary: Marks a progress comment. Temporary comments are skipped when the
            ``config.publish_output_progress`` setting is falsy.

    Returns:
        A dict with the keys "is_temporary", "comment" and "comment_id" describing the
        published comment (also appended to ``self.comments_list``), or None when the
        comment was skipped, no PR/issue is enabled, or the API returned a falsy response.
    """
    if is_temporary and not get_settings().config.publish_output_progress:
        get_logger().debug("Skipping publish_comment for temporary comment")
        return None

    if self.enabled_issue:
        index = self.issue_number
    elif self.enabled_pr:
        index = self.pr_number
    else:
        self.logger.error("Neither PR nor issue URL provided.")
        return None

    comment = self.limit_output_characters(comment, self.max_comment_chars)
    response = self.repo_api.create_comment(
        owner=self.owner,
        repo=self.repo,
        index=index,
        comment=comment
    )
    if not response:
        self.logger.error("Failed to publish comment")
        return None

    if is_temporary:
        self.temp_comments.append(comment)

    # When the API layer hands back a tuple, the created comment is its first element
    # (add_eyes_reaction unwraps its response the same way).
    created = response[0] if isinstance(response, tuple) else response
    comment_obj = {
        "is_temporary": is_temporary,
        "comment": comment,
        "comment_id": created.id
    }
    self.comments_list.append(comment_obj)
    self.logger.info("Comment published")
    return comment_obj
def publish_code_suggestions(self, suggestions: List[Dict[str, Any]]) -> bool:
    """
    Publish each code suggestion as an inline comment.

    Every suggestion dict is read for "body", "relevant_file" and
    "relevant_lines_start"; when an "original_suggestion" dict is present, its
    "relevant_lines_start" supplies the old position and its "suggestion_content"
    (if non-empty) becomes the review title "**Suggestion:** <content>".
    Suggestions without a body are logged and skipped.

    Returns:
        True once all suggestions have been handed to ``publish_inline_comments``.
        The base class declares this method as returning ``bool``; previously the
        implicit ``None`` was indistinguishable from a failure for callers testing
        the result.
    """
    for suggestion in suggestions:
        body = suggestion.get("body", "")
        if not body:
            self.logger.error("No body provided for the suggestion")
            continue

        original = suggestion.get("original_suggestion") if "original_suggestion" in suggestion else None
        path = suggestion.get("relevant_file", "")
        new_position = suggestion.get("relevant_lines_start", 0)
        if "original_suggestion" in suggestion:
            old_position = original.get("relevant_lines_start", 0)
            title_body = original.get("suggestion_content", "")
        else:
            old_position = new_position
            title_body = ""

        payload = dict(body=body, path=path, old_position=old_position, new_position=new_position)
        if title_body:
            self.publish_inline_comments([payload], f"**Suggestion:** {title_body}")
        else:
            self.publish_inline_comments([payload])
    return True
def remove_reaction(self, comment_id: int, reaction_id: Optional[int] = None) -> bool:
    """
    Remove reaction from a comment.

    Args:
        comment_id: Id of the comment whose reaction is removed.
        reaction_id: Accepted so the method can be called with the two positional
            arguments declared by the base class (``issue_comment_id, reaction_id``);
            it is not forwarded to ``repo_api.remove_reaction_comment``.

    Returns:
        True when the API call returned a truthy response, False when it returned a
        falsy response or raised.
    """
    try:
        response = self.repo_api.remove_reaction_comment(
            owner=self.owner,
            repo=self.repo,
            comment_id=comment_id
        )
    except ApiException as e:
        self.logger.error(f"Error removing reaction: {e}")
        return False
    except Exception as e:
        self.logger.error(f"Unexpected error: {e}")
        return False

    if not response:
        self.logger.error("Failed to remove reaction")
        return False
    return True
def get_diff_files(self) -> List[FilePatchInfo]:
    """
    Build (once) and return the ``FilePatchInfo`` list for the files changed in the PR.

    A previously computed non-empty ``self.diff_files`` is returned as-is. Otherwise
    each entry of ``self.git_files`` with a file name accepted by ``is_valid_file`` is
    converted; rejected names are collected and logged. From the
    ``MAX_FILES_ALLOWED_FULL``-th valid file on, files that have a patch get empty head
    and base contents (non-incremental mode only).
    """
    if self.diff_files:
        return self.diff_files

    rejected_names = []
    valid_count = 0
    collected = []

    for entry in self.git_files:
        filename = entry.get("filename")
        if not filename:
            continue
        if not is_valid_file(filename):
            rejected_names.append(filename)
            continue

        valid_count += 1
        patch = self.file_diffs.get(filename, "")

        # Past the limit, skip loading full contents for files that have a patch.
        skip_content = False
        if valid_count >= MAX_FILES_ALLOWED_FULL and patch and not self.incremental.is_incremental:
            skip_content = True
            if valid_count == MAX_FILES_ALLOWED_FULL:
                self.logger.info("Too many files in PR, will avoid loading full content for rest of files")

        # Get file content from this pr
        head_file = "" if skip_content else self.file_contents.get(filename, "")

        if self.incremental.is_incremental and self.unreviewed_files_set:
            base_file = self._get_file_content_from_latest_commit(filename)
            self.unreviewed_files_set[filename] = patch
        elif skip_content:
            base_file = ""
        else:
            base_file = self._get_file_content_from_base(filename)

        added_lines = entry.get("additions", 0)
        removed_lines = entry.get("deletions", 0)
        status = entry.get("status", "")

        if status == 'added':
            edit_type = EDIT_TYPE.ADDED
        elif status in ('removed', 'deleted'):
            edit_type = EDIT_TYPE.DELETED
        elif status == 'renamed':
            edit_type = EDIT_TYPE.RENAMED
        elif status in ('modified', 'changed'):
            edit_type = EDIT_TYPE.MODIFIED
        else:
            self.logger.error(f"Unknown edit type: {status}")
            edit_type = EDIT_TYPE.UNKNOWN

        collected.append(FilePatchInfo(
            base_file=base_file,
            head_file=head_file,
            patch=patch,
            filename=filename,
            num_minus_lines=removed_lines,
            num_plus_lines=added_lines,
            edit_type=edit_type
        ))

    if rejected_names:
        self.logger.info(f"Filtered out files with invalid extensions: {rejected_names}")

    self.diff_files = collected
    return collected
def get_pr_id(self):
    """Return a ``"<repo>/<pr_number>"`` identifier for the current PR.

    Returns:
        The identifier string, or ``""`` when it cannot be built (for example
        when ``repo`` or ``pr_number`` has not been set on the provider).
    """
    try:
        return f"{self.repo}/{self.pr_number}"
    except Exception:
        # Deliberately not a bare ``except:``: that would also swallow
        # KeyboardInterrupt and SystemExit.
        return ""
def get_repo_settings(self) -> str:
    """Fetch the content of the repository settings file.

    The file named by ``self.repo_settings`` is read through
    ``self.repo_api.get_file_content`` at commit ``self.sha``.

    Returns:
        The file content, or ``""`` (after logging an error) when no settings
        path is configured or the fetch produced nothing.
    """
    if self.repo_settings:
        content = self.repo_api.get_file_content(
            owner=self.owner,
            repo=self.repo,
            commit_sha=self.sha,
            filepath=self.repo_settings,
        )
        if content:
            return content
        failure = "Failed to get repository settings"
    else:
        failure = "Repository settings not found"

    self.logger.error(failure)
    return ""
def remove_initial_comment(self) -> None:
    """Remove every temporary comment tracked in ``self.comments_list``.

    Entries whose ``is_temporary`` flag is falsy are left untouched. A failure
    while removing one comment is logged and does not prevent the remaining
    comments from being processed.
    """
    # Iterate over a snapshot: remove_comment() deletes the entry from
    # self.comments_list, and mutating a list while iterating it makes the
    # loop skip the element that follows each removed one.
    for comment in list(self.comments_list or []):
        try:
            if not comment.get("is_temporary"):
                continue
            self.remove_comment(comment)
        except Exception as e:
            self.logger.error(f"Error removing comment: {e}")
            continue
        self.logger.info(f"Removed initial comment: {comment.get('comment_id')}")
def create_inline_comment(self, owner: str, repo: str, pr_number: int, body: str, commit_id: str,
                          comments: List[Dict[str, Any]]):
    """POST a review with inline comments to a pull request.

    Args:
        owner: Repository owner.
        repo: Repository name.
        pr_number: Index of the pull request.
        body: Top-level text of the review.
        commit_id: Commit the review is attached to.
        comments: Inline comment entries sent with the review.

    Returns:
        Whatever ``self.api_client.call_api`` returns for the request.
    """
    # Kept separate from the ``body`` parameter instead of rebinding it.
    review_payload = {
        "body": body,
        "comments": comments,
        "commit_id": commit_id,
    }
    route_params = {'owner': owner, 'repo': repo, 'pr_number': pr_number}
    return self.api_client.call_api(
        '/repos/{owner}/{repo}/pulls/{pr_number}/reviews',
        'POST',
        path_params=route_params,
        body=review_payload,
        response_type='Repository',
        auth_settings=['AuthorizationHeaderToken'],
    )
def get_change_file_pull_request(self, owner: str, repo: str, pr_number: int):
    """Get the changed files of a pull request.

    Issues ``GET /repos/{owner}/{repo}/pulls/{pr_number}/files`` without
    preloading the content, then reads, decodes (UTF-8) and JSON-parses the
    raw body.

    Args:
        owner: Repository owner.
        repo: Repository name.
        pr_number: Index of the pull request.

    Returns:
        The parsed JSON payload, or ``[]`` when the response has an unexpected
        shape or any error occurs (errors are logged, not raised).
    """
    try:
        url = f'/repos/{owner}/{repo}/pulls/{pr_number}/files'
        response = self.api_client.call_api(
            url,
            'GET',
            path_params={},
            response_type=None,
            _return_http_data_only=False,
            _preload_content=False,
            auth_settings=['AuthorizationHeaderToken']
        )

        # The raw stream is either exposed as ``response.data`` or is the
        # first item of a tuple response.
        if hasattr(response, 'data'):
            stream = response.data
        elif isinstance(response, tuple):
            stream = response[0]
        else:
            return []

        # bytes.decode() always yields str, so the payload goes straight to
        # json.loads; the former isinstance(..., str) guard was unreachable.
        return json.loads(stream.read().decode('utf-8'))

    except ApiException as e:
        self.logger.error(f"Error getting changed files: {e}")
        return []
    except Exception as e:
        self.logger.error(f"Unexpected error: {e}")
        return []
def get_file_content(self, owner: str, repo: str, commit_sha: str, filepath: str) -> str:
    """Read a file's raw content from the repository.

    Issues ``GET /repos/{owner}/{repo}/raw/{filepath}``; when ``commit_sha``
    is truthy it is sent as the ``ref`` query parameter.

    Returns:
        The body decoded as UTF-8, or ``""`` when the response has an
        unexpected shape or an error occurs (errors are logged, not raised).
    """
    try:
        ref_query = [('ref', commit_sha)] if commit_sha else []
        api_response = self.api_client.call_api(
            f'/repos/{owner}/{repo}/raw/{filepath}',
            'GET',
            path_params={},
            query_params=ref_query,
            response_type=None,
            _return_http_data_only=False,
            _preload_content=False,
            auth_settings=['AuthorizationHeaderToken']
        )

        # Locate the unread body stream in either supported response shape.
        if hasattr(api_response, 'data'):
            stream = api_response.data
        elif isinstance(api_response, tuple):
            stream = api_response[0]
        else:
            return ""
        return stream.read().decode('utf-8')

    except ApiException as api_error:
        self.logger.error(f"Error getting file: {filepath}, content: {api_error}")
        return ""
    except Exception as error:
        self.logger.error(f"Unexpected error: {error}")
        return ""
def get_pr_commits(self, owner: str, repo: str, pr_number: int):
    """List the commits of a pull request.

    Issues ``GET /repos/{owner}/{repo}/pulls/{pr_number}/commits`` and
    JSON-parses the UTF-8 decoded body.

    Returns:
        The parsed JSON payload, or ``[]`` when the response has an unexpected
        shape or an error occurs (errors are logged, not raised).
    """
    try:
        endpoint = f'/repos/{owner}/{repo}/pulls/{pr_number}/commits'
        api_response = self.api_client.call_api(
            endpoint,
            'GET',
            path_params={},
            response_type=None,
            _return_http_data_only=False,
            _preload_content=False,
            auth_settings=['AuthorizationHeaderToken']
        )

        has_data_attr = hasattr(api_response, 'data')
        if not has_data_attr and not isinstance(api_response, tuple):
            return []

        stream = api_response.data if has_data_attr else api_response[0]
        encoded = stream.read()
        return json.loads(encoded.decode('utf-8'))

    except ApiException as api_error:
        self.logger.error(f"Error getting PR commits: {api_error}")
        return []
    except Exception as error:
        self.logger.error(f"Unexpected error: {error}")
        return []
def __init__(self, pr_url: Optional[str] = None):
    """Create a GitHub provider, optionally bound to a PR or an issue.

    Args:
        pr_url: URL of a pull request (contains ``'pull'``), of an issue
            (contains ``'issue'``), or ``None`` to create the provider
            without any PR/issue context.
    """
    self.repo_obj = None
    try:
        self.installation_id = context.get("installation_id", None)
    except Exception:
        # Reading the request context can fail; fall back to no installation id.
        self.installation_id = None
    self.max_comment_chars = 65000
    self.base_url = get_settings().get("GITHUB.BASE_URL", "https://api.github.com").rstrip("/")  # "https://api.github.com"
    # HTML base: the part of the API base before "api/" when present, otherwise github.com
    self.base_url_html = self.base_url.split("api/")[0].rstrip("/") if "api/" in self.base_url else "https://github.com"
    self.github_client = self._get_github_client()
    self.repo = None
    self.pr_num = None
    self.pr = None
    self.issue_main = None
    self.github_user_id = None
    self.diff_files = None
    self.git_files = None
    self.incremental = IncrementalPR(False)
    # Defined on every path: previously the issue branch left this attribute
    # unset even though _get_incremental_commits() reads it.
    self.pr_commits = None
    if pr_url and 'pull' in pr_url:
        self.set_pr(pr_url)
        self.pr_commits = list(self.pr.get_commits())
        self.last_commit_id = self.pr_commits[-1]
        self.pr_url = self.get_pr_url()  # pr_url for github actions can be as api.github.com, so we need to get the url from the pr object
    elif pr_url and 'issue' in pr_url:  # url is an issue
        self.issue_main = self._get_issue_handle(pr_url)
    # else: instantiated the provider without a PR / Issue
return None # else: Check if can get a valid Repo handle: try: repo_obj = self.github_client.get_repo(repo_name) if not repo_obj: get_logger().error(f"Given url: {issue_url}, belonging to owner/repo: {repo_name} does " f"not have a valid repository: {self.get_git_repo_url(issue_url)}") return None # else: Valid repo handle: return repo_obj.get_issue(issue_number) except Exception as e: get_logger().exception(f"Failed to get an issue object for issue: {issue_url}, belonging to owner/repo: {repo_name}") return None def get_incremental_commits(self, incremental=IncrementalPR(False)): self.incremental = incremental if self.incremental.is_incremental: self.unreviewed_files_set = dict() self._get_incremental_commits() def is_supported(self, capability: str) -> bool: return True def _get_owner_and_repo_path(self, given_url: str) -> str: try: repo_path = None if 'issues' in given_url: repo_path, _ = self._parse_issue_url(given_url) elif 'pull' in given_url: repo_path, _ = self._parse_pr_url(given_url) elif given_url.endswith('.git'): parsed_url = urlparse(given_url) repo_path = (parsed_url.path.split('.git')[0])[1:] # //.git -> / if not repo_path: get_logger().error(f"url is neither an issues url nor a PR url nor a valid git url: {given_url}. Returning empty result.") return "" return repo_path except Exception as e: get_logger().exception(f"unable to parse url: {given_url}. Returning empty result.") return "" def get_git_repo_url(self, issues_or_pr_url: str) -> str: repo_path = self._get_owner_and_repo_path(issues_or_pr_url) #Return: / if not repo_path or repo_path not in issues_or_pr_url: get_logger().error(f"Unable to retrieve owner/path from url: {issues_or_pr_url}") return "" return f"{self.base_url_html}/{repo_path}.git" #https://github.com / /.git # Given a git repo url, return prefix and suffix of the provider in order to view a given file belonging to that repo. 
def get_canonical_url_parts(self, repo_git_url:str, desired_branch:str) -> Tuple[str, str]:
    """Return the (prefix, suffix) used to build a link to a file in a repo.

    Example: https://github.com/qodo-ai/pr-agent.git and branch: v0.8 ->
    prefix: "https://github.com/qodo-ai/pr-agent/blob/v0.8", suffix: "".

    Context is resolved in this order:
      1. ``repo_git_url`` when given, otherwise the issue this provider was
         created for (whose repository default branch replaces
         ``desired_branch``);
      2. otherwise ``self.repo``, using ``self.repo_obj.default_branch``.

    Returns:
        ``(prefix, suffix)``; ``("", "")`` when the repo path is malformed or
        no context is available. The suffix is always empty for GitHub.
    """
    owner = repo = scheme_and_netloc = None

    # An explicit git url (possibly different from what this provider was
    # initialized with) or an issue.
    if repo_git_url or self.issue_main:
        if repo_git_url:
            html_url = repo_git_url
        else:
            desired_branch = self.issue_main.repository.default_branch
            html_url = self.issue_main.html_url

        url_parts = urlparse(html_url)
        scheme_and_netloc = url_parts.scheme + "://" + url_parts.netloc
        repo_path = self._get_owner_and_repo_path(html_url)
        # Has to have the form <owner>/<repo>
        if repo_path.count('/') != 1:
            get_logger().error(f"Invalid repo_path: {repo_path} from url: {html_url}")
            return ("", "")
        owner, repo = repo_path.split('/')

    # Neither owner nor repo resolved above: fall back to the PR's repository.
    if self.repo and not (owner and repo):
        owner, repo = self.repo.split('/')
        scheme_and_netloc = self.base_url_html
        desired_branch = self.repo_obj.default_branch

    # Not invoked from a PR context, and no git url was provided either.
    if not (scheme_and_netloc and owner and repo):
        get_logger().error(f"Unable to get canonical url parts since missing context (PR or explicit git url)")
        return ("", "")

    # github does not add a suffix
    return (f"{scheme_and_netloc}/{owner}/{repo}/blob/{desired_branch}", "")
def get_commit_range(self):
    """Return the PR commits authored after the previous review was posted.

    Walks ``self.pr_commits`` from newest to oldest. Each commit whose author
    date is later than the review's ``created_at`` is recorded as
    ``self.incremental.first_new_commit``, so the oldest such commit wins. The
    first commit that is not newer is stored as
    ``self.incremental.last_seen_commit`` and ends the walk.

    Returns:
        The slice of ``self.pr_commits`` starting at the oldest new commit,
        or ``[]`` when no commit is newer than the review.
    """
    review_time = self.previous_review.created_at
    oldest_new_position = None

    for position in reversed(range(len(self.pr_commits))):
        candidate = self.pr_commits[position]
        if not candidate.commit.author.date > review_time:
            self.incremental.last_seen_commit = candidate
            break
        self.incremental.first_new_commit = candidate
        oldest_new_position = position

    if oldest_new_position is None:
        return []
    return self.pr_commits[oldest_new_position:]
@retry(exceptions=RateLimitExceeded,
       tries=get_settings().github.ratelimit_retries, delay=2, backoff=2, jitter=(1, 3))
def get_diff_files(self) -> list[FilePatchInfo]:
    """
    Build the list of FilePatchInfo objects for the files changed in the pull request.

    The result is cached: a non-empty ``diff_files`` entry in the request
    ``context`` or in ``self.diff_files`` is returned directly. Otherwise the
    PR files are filtered with the [ignore] patterns and ``is_valid_file``,
    and for each remaining file the head content, the base content (taken at
    the merge-base commit when it can be computed) and the patch are collected.

    Once ``MAX_FILES_ALLOWED_FULL`` valid files have been seen, files that
    have a patch are represented by the patch alone (empty contents), unless
    the review is incremental.

    Returns:
        diff_files (list[FilePatchInfo]): one entry per valid changed file.

    Raises:
        RateLimitExceeded: for ANY exception raised while building the list
            (the original error is logged and chained); this is the exception
            type the ``@retry`` decorator retries on.
    """
    try:
        # Fast path 1: result cached in the request context (reading it may fail).
        try:
            diff_files = context.get("diff_files", None)
            if diff_files:
                return diff_files
        except Exception:
            pass

        # Fast path 2: result cached on this provider instance.
        if self.diff_files:
            return self.diff_files

        # filter files using [ignore] patterns
        files_original = self.get_files()
        files = filter_ignored(files_original)
        if files_original != files:
            # Logging of the filtered names is best-effort only.
            try:
                names_original = [file.filename for file in files_original]
                names_new = [file.filename for file in files]
                get_logger().info(f"Filtered out [ignore] files for pull request:", extra=
                {"files": names_original,
                 "filtered_files": names_new})
            except Exception:
                pass

        diff_files = []
        invalid_files_names = []
        # NOTE(review): never set to True anywhere in this function, so the
        # "close to rate limit" branch below is currently not taken.
        is_close_to_rate_limit = False

        # The base.sha will point to the current state of the base branch (including parallel merges), not the original base commit when the PR was created
        # We can fix this by finding the merge base commit between the PR head and base branches
        # Note that The pr.head.sha is actually correct as is - it points to the latest commit in your PR branch.
        # This SHA isn't affected by parallel merges to the base branch since it's specific to your PR's branch.
        repo = self.repo_obj
        pr = self.pr
        try:
            compare = repo.compare(pr.base.sha, pr.head.sha)  # communication with GitHub
            merge_base_commit = compare.merge_base_commit
        except Exception as e:
            # Fall back to the PR base itself when the comparison fails.
            get_logger().error(f"Failed to get merge base commit: {e}")
            merge_base_commit = pr.base
        if merge_base_commit.sha != pr.base.sha:
            get_logger().info(
                f"Using merge base commit {merge_base_commit.sha} instead of base commit ")

        counter_valid = 0
        for file in files:
            if not is_valid_file(file.filename):
                invalid_files_names.append(file.filename)
                continue

            patch = file.patch
            if is_close_to_rate_limit:
                new_file_content_str = ""
                original_file_content_str = ""
            else:
                # allow only a limited number of files to be fully loaded. We can manage the rest with diffs only
                counter_valid += 1
                avoid_load = False
                if counter_valid >= MAX_FILES_ALLOWED_FULL and patch and not self.incremental.is_incremental:
                    avoid_load = True
                    # Log only once, when the limit is first reached.
                    if counter_valid == MAX_FILES_ALLOWED_FULL:
                        get_logger().info(f"Too many files in PR, will avoid loading full content for rest of files")

                if avoid_load:
                    new_file_content_str = ""
                else:
                    new_file_content_str = self._get_pr_file_content(file, self.pr.head.sha)  # communication with GitHub

                if self.incremental.is_incremental and self.unreviewed_files_set:
                    # Incremental review: diff against the last commit seen by
                    # the previous review and recompute the patch from contents.
                    original_file_content_str = self._get_pr_file_content(file, self.incremental.last_seen_commit_sha)
                    patch = load_large_diff(file.filename, new_file_content_str, original_file_content_str)
                    self.unreviewed_files_set[file.filename] = patch
                else:
                    if avoid_load:
                        original_file_content_str = ""
                    else:
                        original_file_content_str = self._get_pr_file_content(file, merge_base_commit.sha)
                        # original_file_content_str = self._get_pr_file_content(file, self.pr.base.sha)
                    # The file object carried no patch: compute one from the contents.
                    if not patch:
                        patch = load_large_diff(file.filename, new_file_content_str, original_file_content_str)

            # Map the reported file status to the internal edit type.
            if file.status == 'added':
                edit_type = EDIT_TYPE.ADDED
            elif file.status == 'removed':
                edit_type = EDIT_TYPE.DELETED
            elif file.status == 'renamed':
                edit_type = EDIT_TYPE.RENAMED
            elif file.status == 'modified':
                edit_type = EDIT_TYPE.MODIFIED
            else:
                get_logger().error(f"Unknown edit type: {file.status}")
                edit_type = EDIT_TYPE.UNKNOWN

            # count number of lines added and removed
            if hasattr(file, 'additions') and hasattr(file, 'deletions'):
                num_plus_lines = file.additions
                num_minus_lines = file.deletions
            else:
                # Fallback: count patch lines by their leading '+' / '-'.
                patch_lines = patch.splitlines(keepends=True)
                num_plus_lines = len([line for line in patch_lines if line.startswith('+')])
                num_minus_lines = len([line for line in patch_lines if line.startswith('-')])

            file_patch_canonical_structure = FilePatchInfo(original_file_content_str, new_file_content_str, patch,
                                                           file.filename, edit_type=edit_type,
                                                           num_plus_lines=num_plus_lines,
                                                           num_minus_lines=num_minus_lines,)
            diff_files.append(file_patch_canonical_structure)
        if invalid_files_names:
            get_logger().info(f"Filtered out files with invalid extensions: {invalid_files_names}")

        # Cache on the instance and, best-effort, in the request context.
        self.diff_files = diff_files
        try:
            context["diff_files"] = diff_files
        except Exception:
            pass

        return diff_files

    except Exception as e:
        # Every failure is surfaced as RateLimitExceeded so that @retry re-runs the call.
        get_logger().error(f"Failing to get diff files: {e}",
                           artifact={"traceback": traceback.format_exc()})
        raise RateLimitExceeded("Rate limit exceeded for GitHub API.") from e
def create_inline_comment(self, body: str, relevant_file: str, relevant_line_in_file: str,
                          absolute_position: int = None):
    """Build the payload of one inline (diff-positioned) review comment.

    Args:
        body: Comment text; truncated to ``self.max_comment_chars``.
        relevant_file: Path of the commented file, possibly wrapped in backticks.
        relevant_line_in_file: Content of the line the comment refers to.
        absolute_position: Optional line number hint passed to the line lookup.

    Returns:
        ``dict(body=..., path=..., position=...)`` when the line was located
        in ``self.diff_files``, otherwise ``{}``.
    """
    body = self.limit_output_characters(body, self.max_comment_chars)
    position, absolute_position = find_line_number_of_relevant_line_in_file(self.diff_files,
                                                                            relevant_file.strip('`'),
                                                                            relevant_line_in_file,
                                                                            absolute_position)
    if position == -1:
        get_logger().info(f"Could not find position for {relevant_file} {relevant_line_in_file}")
        return {}

    # The lookup above already ignores wrapping backticks; the published path
    # must drop them too, otherwise it does not name the file that was matched.
    path = relevant_file.strip().strip('`')
    return dict(body=body, path=path, position=position)
comments one by one try: self._publish_inline_comments_fallback_with_verification(comments) except Exception as e: get_logger().error(f"Failed to publish inline code comments fallback, error: {e}") raise e def get_review_thread_comments(self, comment_id: int) -> list[dict]: """ Retrieves all comments in the same thread as the given comment. Args: comment_id: Review comment ID Returns: List of comments in the same thread """ try: # Fetch all comments with a single API call all_comments = list(self.pr.get_comments()) # Find the target comment by ID target_comment = next((c for c in all_comments if c.id == comment_id), None) if not target_comment: return [] # Get root comment id root_comment_id = target_comment.raw_data.get("in_reply_to_id", target_comment.id) # Build the thread - include the root comment and all replies to it thread_comments = [ c for c in all_comments if c.id == root_comment_id or c.raw_data.get("in_reply_to_id") == root_comment_id ] return thread_comments except Exception as e: get_logger().exception(f"Failed to get review comments for an inline ask command", artifact={"comment_id": comment_id, "error": e}) return [] def _publish_inline_comments_fallback_with_verification(self, comments: list[dict]): """ Check each inline comment separately against the GitHub API and discard of invalid comments, then publish all the remaining valid comments in a single review. For invalid comments, also try removing the suggestion part and posting the comment just on the first line. 
""" verified_comments, invalid_comments = self._verify_code_comments(comments) # publish as a group the verified comments if verified_comments: try: self.pr.create_review(commit=self.last_commit_id, comments=verified_comments) except: pass # try to publish one by one the invalid comments as a one-line code comment if invalid_comments and get_settings().github.try_fix_invalid_inline_comments: fixed_comments_as_one_liner = self._try_fix_invalid_inline_comments( [comment for comment, _ in invalid_comments]) for comment in fixed_comments_as_one_liner: try: self.publish_inline_comments([comment], disable_fallback=True) get_logger().info(f"Published invalid comment as a single line comment: {comment}") except: get_logger().error(f"Failed to publish invalid comment as a single line comment: {comment}") def _verify_code_comment(self, comment: dict): is_verified = False e = None try: # event ="" # By leaving this blank, you set the review action state to PENDING input = dict(commit_id=self.last_commit_id.sha, comments=[comment]) headers, data = self.pr._requester.requestJsonAndCheck( "POST", f"{self.pr.url}/reviews", input=input) pending_review_id = data["id"] is_verified = True except Exception as err: is_verified = False pending_review_id = None e = err if pending_review_id is not None: try: self.pr._requester.requestJsonAndCheck("DELETE", f"{self.pr.url}/reviews/{pending_review_id}") except Exception: pass return is_verified, e def _verify_code_comments(self, comments: list[dict]) -> tuple[list[dict], list[tuple[dict, Exception]]]: """Very each comment against the GitHub API and return 2 lists: 1 of verified and 1 of invalid comments""" verified_comments = [] invalid_comments = [] for comment in comments: time.sleep(1) # for avoiding secondary rate limit is_verified, e = self._verify_code_comment(comment) if is_verified: verified_comments.append(comment) else: invalid_comments.append((comment, e)) return verified_comments, invalid_comments def 
_try_fix_invalid_inline_comments(self, invalid_comments: list[dict]) -> list[dict]: """ Try fixing invalid comments by removing the suggestion part and setting the comment just on the first line. Return only comments that have been modified in some way. This is a best-effort attempt to fix invalid comments, and should be verified accordingly. """ import copy fixed_comments = [] for comment in invalid_comments: try: fixed_comment = copy.deepcopy(comment) # avoid modifying the original comment dict for later logging if "```suggestion" in comment["body"]: fixed_comment["body"] = comment["body"].split("```suggestion")[0] if "start_line" in comment: fixed_comment["line"] = comment["start_line"] del fixed_comment["start_line"] if "start_side" in comment: fixed_comment["side"] = comment["start_side"] del fixed_comment["start_side"] if fixed_comment != comment: fixed_comments.append(fixed_comment) except Exception as e: get_logger().error(f"Failed to fix inline comment, error: {e}") return fixed_comments def publish_code_suggestions(self, code_suggestions: list) -> bool: """ Publishes code suggestions as comments on the PR. 
""" post_parameters_list = [] code_suggestions_validated = self.validate_comments_inside_hunks(code_suggestions) for suggestion in code_suggestions_validated: body = suggestion['body'] relevant_file = suggestion['relevant_file'] relevant_lines_start = suggestion['relevant_lines_start'] relevant_lines_end = suggestion['relevant_lines_end'] if not relevant_lines_start or relevant_lines_start == -1: get_logger().exception( f"Failed to publish code suggestion, relevant_lines_start is {relevant_lines_start}") continue if relevant_lines_end < relevant_lines_start: get_logger().exception(f"Failed to publish code suggestion, " f"relevant_lines_end is {relevant_lines_end} and " f"relevant_lines_start is {relevant_lines_start}") continue if relevant_lines_end > relevant_lines_start: post_parameters = { "body": body, "path": relevant_file, "line": relevant_lines_end, "start_line": relevant_lines_start, "start_side": "RIGHT", } else: # API is different for single line comments post_parameters = { "body": body, "path": relevant_file, "line": relevant_lines_start, "side": "RIGHT", } post_parameters_list.append(post_parameters) try: self.publish_inline_comments(post_parameters_list) return True except Exception as e: get_logger().error(f"Failed to publish code suggestion, error: {e}") return False def edit_comment(self, comment, body: str): try: body = self.limit_output_characters(body, self.max_comment_chars) comment.edit(body=body) except GithubException as e: if hasattr(e, "status") and e.status == 403: # Log as warning for permission-related issues (usually due to polling) get_logger().warning( "Failed to edit github comment due to permission restrictions", artifact={"error": e}) else: get_logger().exception(f"Failed to edit github comment", artifact={"error": e}) def edit_comment_from_comment_id(self, comment_id: int, body: str): try: # self.pr.get_issue_comment(comment_id).edit(body) body = self.limit_output_characters(body, self.max_comment_chars) headers, data_patch = 
self.pr._requester.requestJsonAndCheck( "PATCH", f"{self.base_url}/repos/{self.repo}/issues/comments/{comment_id}", input={"body": body} ) except Exception as e: get_logger().exception(f"Failed to edit comment, error: {e}") def reply_to_comment_from_comment_id(self, comment_id: int, body: str): try: # self.pr.get_issue_comment(comment_id).edit(body) body = self.limit_output_characters(body, self.max_comment_chars) headers, data_patch = self.pr._requester.requestJsonAndCheck( "POST", f"{self.base_url}/repos/{self.repo}/pulls/{self.pr_num}/comments/{comment_id}/replies", input={"body": body} ) except Exception as e: get_logger().exception(f"Failed to reply comment, error: {e}") def get_comment_body_from_comment_id(self, comment_id: int): try: # self.pr.get_issue_comment(comment_id).edit(body) headers, data_patch = self.pr._requester.requestJsonAndCheck( "GET", f"{self.base_url}/repos/{self.repo}/issues/comments/{comment_id}" ) return data_patch.get("body","") except Exception as e: get_logger().exception(f"Failed to edit comment, error: {e}") return None def publish_file_comments(self, file_comments: list) -> bool: try: headers, existing_comments = self.pr._requester.requestJsonAndCheck( "GET", f"{self.pr.url}/comments" ) for comment in file_comments: comment['commit_id'] = self.last_commit_id.sha comment['body'] = self.limit_output_characters(comment['body'], self.max_comment_chars) found = False for existing_comment in existing_comments: comment['commit_id'] = self.last_commit_id.sha our_app_name = get_settings().get("GITHUB.APP_NAME", "") same_comment_creator = False if self.deployment_type == 'app': same_comment_creator = our_app_name.lower() in existing_comment['user']['login'].lower() elif self.deployment_type == 'user': same_comment_creator = self.github_user_id == existing_comment['user']['login'] if existing_comment['subject_type'] == 'file' and comment['path'] == existing_comment['path'] and same_comment_creator: headers, data_patch = 
self.pr._requester.requestJsonAndCheck( "PATCH", f"{self.base_url}/repos/{self.repo}/pulls/comments/{existing_comment['id']}", input={"body":comment['body']} ) found = True break if not found: headers, data_post = self.pr._requester.requestJsonAndCheck( "POST", f"{self.pr.url}/comments", input=comment ) return True except Exception as e: get_logger().error(f"Failed to publish diffview file summary, error: {e}") return False def remove_initial_comment(self): try: for comment in getattr(self.pr, 'comments_list', []): if comment.is_temporary: self.remove_comment(comment) except Exception as e: get_logger().exception(f"Failed to remove initial comment, error: {e}") def remove_comment(self, comment): try: comment.delete() except Exception as e: get_logger().exception(f"Failed to remove comment, error: {e}") def get_title(self): return self.pr.title def get_languages(self): languages = self._get_repo().get_languages() return languages def get_pr_branch(self): return self.pr.head.ref def get_pr_owner_id(self) -> str | None: if not self.repo: return None return self.repo.split('/')[0] def get_pr_description_full(self): return self.pr.body def get_user_id(self): if not self.github_user_id: try: self.github_user_id = self.github_client.get_user().raw_data['login'] except Exception as e: self.github_user_id = "" # logging.exception(f"Failed to get user id, error: {e}") return self.github_user_id def get_notifications(self, since: datetime): deployment_type = get_settings().get("GITHUB.DEPLOYMENT_TYPE", "user") if deployment_type != 'user': raise ValueError("Deployment mode must be set to 'user' to get notifications") notifications = self.github_client.get_user().get_notifications(since=since) return notifications def get_issue_comments(self): return self.pr.get_issue_comments() def get_repo_settings(self): try: # contents = self.repo_obj.get_contents(".pr_agent.toml", ref=self.pr.head.sha).decoded_content # more logical to take 'pr_agent.toml' from the default branch contents 
= self.repo_obj.get_contents(".pr_agent.toml").decoded_content return contents except Exception: return "" def get_workspace_name(self): return self.repo.split('/')[0] def add_eyes_reaction(self, issue_comment_id: int, disable_eyes: bool = False) -> Optional[int]: if disable_eyes: return None try: headers, data_patch = self.pr._requester.requestJsonAndCheck( "POST", f"{self.base_url}/repos/{self.repo}/issues/comments/{issue_comment_id}/reactions", input={"content": "eyes"} ) return data_patch.get("id", None) except Exception as e: get_logger().warning(f"Failed to add eyes reaction, error: {e}") return None def remove_reaction(self, issue_comment_id: int, reaction_id: str) -> bool: try: # self.pr.get_issue_comment(issue_comment_id).delete_reaction(reaction_id) headers, data_patch = self.pr._requester.requestJsonAndCheck( "DELETE", f"{self.base_url}/repos/{self.repo}/issues/comments/{issue_comment_id}/reactions/{reaction_id}" ) return True except Exception as e: get_logger().exception(f"Failed to remove eyes reaction, error: {e}") return False def _parse_pr_url(self, pr_url: str) -> Tuple[str, int]: parsed_url = urlparse(pr_url) if parsed_url.path.startswith('/api/v3'): parsed_url = urlparse(pr_url.replace("/api/v3", "")) path_parts = parsed_url.path.strip('/').split('/') if 'api.github.com' in parsed_url.netloc or '/api/v3' in pr_url: if len(path_parts) < 5 or path_parts[3] != 'pulls': raise ValueError("The provided URL does not appear to be a GitHub PR URL") repo_name = '/'.join(path_parts[1:3]) try: pr_number = int(path_parts[4]) except ValueError as e: raise ValueError("Unable to convert PR number to integer") from e return repo_name, pr_number if len(path_parts) < 4 or path_parts[2] != 'pull': raise ValueError("The provided URL does not appear to be a GitHub PR URL") repo_name = '/'.join(path_parts[:2]) try: pr_number = int(path_parts[3]) except ValueError as e: raise ValueError("Unable to convert PR number to integer") from e return repo_name, pr_number def 
_parse_issue_url(self, issue_url: str) -> Tuple[str, int]: parsed_url = urlparse(issue_url) if parsed_url.path.startswith('/api/v3'): #Check if came from github app parsed_url = urlparse(issue_url.replace("/api/v3", "")) path_parts = parsed_url.path.strip('/').split('/') if 'api.github.com' in parsed_url.netloc or '/api/v3' in issue_url: #Check if came from github app if len(path_parts) < 5 or path_parts[3] != 'issues': raise ValueError("The provided URL does not appear to be a GitHub ISSUE URL") repo_name = '/'.join(path_parts[1:3]) try: issue_number = int(path_parts[4]) except ValueError as e: raise ValueError("Unable to convert issue number to integer") from e return repo_name, issue_number if len(path_parts) < 4 or path_parts[2] != 'issues': raise ValueError("The provided URL does not appear to be a GitHub PR issue") repo_name = '/'.join(path_parts[:2]) try: issue_number = int(path_parts[3]) except ValueError as e: raise ValueError("Unable to convert issue number to integer") from e return repo_name, issue_number def _get_github_client(self): self.deployment_type = get_settings().get("GITHUB.DEPLOYMENT_TYPE", "user") self.auth = None if self.deployment_type == 'app': try: private_key = get_settings().github.private_key app_id = get_settings().github.app_id except AttributeError as e: raise ValueError("GitHub app ID and private key are required when using GitHub app deployment") from e if not self.installation_id: raise ValueError("GitHub app installation ID is required when using GitHub app deployment") auth = AppAuthentication(app_id=app_id, private_key=private_key, installation_id=self.installation_id) self.auth = auth elif self.deployment_type == 'user': try: token = get_settings().github.user_token except AttributeError as e: raise ValueError( "GitHub token is required when using user deployment. 
See: " "https://github.com/Codium-ai/pr-agent#method-2-run-from-source") from e self.auth = Auth.Token(token) if self.auth: return Github(auth=self.auth, base_url=self.base_url) else: raise ValueError("Could not authenticate to GitHub") def _get_repo(self): if hasattr(self, 'repo_obj') and \ hasattr(self.repo_obj, 'full_name') and \ self.repo_obj.full_name == self.repo: return self.repo_obj else: self.repo_obj = self.github_client.get_repo(self.repo) return self.repo_obj def _get_pr(self): return self._get_repo().get_pull(self.pr_num) def get_pr_file_content(self, file_path: str, branch: str) -> str: try: file_content_str = str( self._get_repo() .get_contents(file_path, ref=branch) .decoded_content.decode() ) except Exception: file_content_str = "" return file_content_str def create_or_update_pr_file( self, file_path: str, branch: str, contents="", message="" ) -> None: try: file_obj = self._get_repo().get_contents(file_path, ref=branch) sha1=file_obj.sha except Exception: sha1="" self.repo_obj.update_file( path=file_path, message=message, content=contents, sha=sha1, branch=branch, ) def _get_pr_file_content(self, file: FilePatchInfo, sha: str) -> str: return self.get_pr_file_content(file.filename, sha) def publish_labels(self, pr_types): try: label_color_map = {"Bug fix": "1d76db", "Tests": "e99695", "Bug fix with tests": "c5def5", "Enhancement": "bfd4f2", "Documentation": "d4c5f9", "Other": "d1bcf9"} post_parameters = [] for p in pr_types: color = label_color_map.get(p, "d1bcf9") # default to "Other" color post_parameters.append({"name": p, "color": color}) headers, data = self.pr._requester.requestJsonAndCheck( "PUT", f"{self.pr.issue_url}/labels", input=post_parameters ) except Exception as e: get_logger().warning(f"Failed to publish labels, error: {e}") def get_pr_labels(self, update=False): try: if not update: labels =self.pr.labels return [label.name for label in labels] else: # obtain the latest labels. 
Maybe they changed while the AI was running headers, labels = self.pr._requester.requestJsonAndCheck( "GET", f"{self.pr.issue_url}/labels") return [label['name'] for label in labels] except Exception as e: get_logger().exception(f"Failed to get labels, error: {e}") return [] def get_repo_labels(self): labels = self.repo_obj.get_labels() return [label for label in itertools.islice(labels, 50)] def get_commit_messages(self): """ Retrieves the commit messages of a pull request. Returns: str: A string containing the commit messages of the pull request. """ max_tokens = get_settings().get("CONFIG.MAX_COMMITS_TOKENS", None) try: commit_list = self.pr.get_commits() commit_messages = [commit.commit.message for commit in commit_list] commit_messages_str = "\n".join([f"{i + 1}. {message}" for i, message in enumerate(commit_messages)]) except Exception: commit_messages_str = "" if max_tokens: commit_messages_str = clip_tokens(commit_messages_str, max_tokens) return commit_messages_str def generate_link_to_relevant_line_number(self, suggestion) -> str: try: relevant_file = suggestion['relevant_file'].strip('`').strip("'").strip('\n') relevant_line_str = suggestion['relevant_line'].strip('\n') if not relevant_line_str: return "" position, absolute_position = find_line_number_of_relevant_line_in_file \ (self.diff_files, relevant_file, relevant_line_str) if absolute_position != -1: # # link to right file only # link = f"https://github.com/{self.repo}/blob/{self.pr.head.sha}/{relevant_file}" \ # + "#" + f"L{absolute_position}" # link to diff sha_file = hashlib.sha256(relevant_file.encode('utf-8')).hexdigest() link = f"{self.base_url_html}/{self.repo}/pull/{self.pr_num}/files#diff-{sha_file}R{absolute_position}" return link except Exception as e: get_logger().info(f"Failed adding line link, error: {e}") return "" def get_line_link(self, relevant_file: str, relevant_line_start: int, relevant_line_end: int = None) -> str: sha_file = 
hashlib.sha256(relevant_file.encode('utf-8')).hexdigest() if relevant_line_start == -1: link = f"{self.base_url_html}/{self.repo}/pull/{self.pr_num}/files#diff-{sha_file}" elif relevant_line_end: link = f"{self.base_url_html}/{self.repo}/pull/{self.pr_num}/files#diff-{sha_file}R{relevant_line_start}-R{relevant_line_end}" else: link = f"{self.base_url_html}/{self.repo}/pull/{self.pr_num}/files#diff-{sha_file}R{relevant_line_start}" return link def get_lines_link_original_file(self, filepath: str, component_range: Range) -> str: """ Returns the link to the original file on GitHub that corresponds to the given filepath and component range. Args: filepath (str): The path of the file. component_range (Range): The range of lines that represent the component. Returns: str: The link to the original file on GitHub. Example: >>> filepath = "path/to/file.py" >>> component_range = Range(line_start=10, line_end=20) >>> link = get_lines_link_original_file(filepath, component_range) >>> print(link) "https://github.com/{repo}/blob/{commit_sha}/{filepath}/#L11-L21" """ line_start = component_range.line_start + 1 line_end = component_range.line_end + 1 # link = (f"https://github.com/{self.repo}/blob/{self.last_commit_id.sha}/{filepath}/" # f"#L{line_start}-L{line_end}") link = (f"{self.base_url_html}/{self.repo}/blob/{self.last_commit_id.sha}/{filepath}/" f"#L{line_start}-L{line_end}") return link def get_pr_id(self): try: pr_id = f"{self.repo}/{self.pr_num}" return pr_id except: return "" def fetch_sub_issues(self, issue_url): """ Fetch sub-issues linked to the given GitHub issue URL using GraphQL via PyGitHub. 
""" sub_issues = set() # Extract owner, repo, and issue number from URL parts = issue_url.rstrip("/").split("/") owner, repo, issue_number = parts[-4], parts[-3], parts[-1] try: # Gets Issue ID from Issue Number query = f""" query {{ repository(owner: "{owner}", name: "{repo}") {{ issue(number: {issue_number}) {{ id }} }} }} """ response_tuple = self.github_client._Github__requester.requestJson("POST", "/graphql", input={"query": query}) # Extract the JSON response from the tuple and parses it if isinstance(response_tuple, tuple) and len(response_tuple) == 3: response_json = json.loads(response_tuple[2]) else: get_logger().error(f"Unexpected response format: {response_tuple}") return sub_issues issue_id = response_json.get("data", {}).get("repository", {}).get("issue", {}).get("id") if not issue_id: get_logger().warning(f"Issue ID not found for {issue_url}") return sub_issues # Fetch Sub-Issues sub_issues_query = f""" query {{ node(id: "{issue_id}") {{ ... on Issue {{ subIssues(first: 10) {{ nodes {{ url }} }} }} }} }} """ sub_issues_response_tuple = self.github_client._Github__requester.requestJson("POST", "/graphql", input={ "query": sub_issues_query}) # Extract the JSON response from the tuple and parses it if isinstance(sub_issues_response_tuple, tuple) and len(sub_issues_response_tuple) == 3: sub_issues_response_json = json.loads(sub_issues_response_tuple[2]) else: get_logger().error("Unexpected sub-issues response format", artifact={"response": sub_issues_response_tuple}) return sub_issues if not sub_issues_response_json.get("data", {}).get("node", {}).get("subIssues"): get_logger().error("Invalid sub-issues response structure") return sub_issues nodes = sub_issues_response_json.get("data", {}).get("node", {}).get("subIssues", {}).get("nodes", []) get_logger().info(f"Github Sub-issues fetched: {len(nodes)}", artifact={"nodes": nodes}) for sub_issue in nodes: if "url" in sub_issue: sub_issues.add(sub_issue["url"]) except Exception as e: 
get_logger().exception(f"Failed to fetch sub-issues. Error: {e}") return sub_issues def auto_approve(self) -> bool: try: res = self.pr.create_review(event="APPROVE") if res.state == "APPROVED": return True return False except Exception as e: get_logger().exception(f"Failed to auto-approve, error: {e}") return False def calc_pr_statistics(self, pull_request_data: dict): return {} def validate_comments_inside_hunks(self, code_suggestions): """ validate that all committable comments are inside PR hunks - this is a must for committable comments in GitHub """ code_suggestions_copy = copy.deepcopy(code_suggestions) diff_files = self.get_diff_files() RE_HUNK_HEADER = re.compile( r"^@@ -(\d+)(?:,(\d+))? \+(\d+)(?:,(\d+))? @@[ ]?(.*)") diff_files = set_file_languages(diff_files) for suggestion in code_suggestions_copy: try: relevant_file_path = suggestion['relevant_file'] for file in diff_files: if file.filename == relevant_file_path: # generate on-demand the patches range for the relevant file patch_str = file.patch if not hasattr(file, 'patches_range'): file.patches_range = [] patch_lines = patch_str.splitlines() for i, line in enumerate(patch_lines): if line.startswith('@@'): match = RE_HUNK_HEADER.match(line) # identify hunk header if match: section_header, size1, size2, start1, start2 = extract_hunk_headers(match) file.patches_range.append({'start': start2, 'end': start2 + size2 - 1}) patches_range = file.patches_range comment_start_line = suggestion.get('relevant_lines_start', None) comment_end_line = suggestion.get('relevant_lines_end', None) original_suggestion = suggestion.get('original_suggestion', None) # needed for diff code if not comment_start_line or not comment_end_line or not original_suggestion: continue # check if the comment is inside a valid hunk is_valid_hunk = False min_distance = float('inf') patch_range_min = None # find the hunk that contains the comment, or the closest one for i, patch_range in enumerate(patches_range): d1 = comment_start_line - 
patch_range['start'] d2 = patch_range['end'] - comment_end_line if d1 >= 0 and d2 >= 0: # found a valid hunk is_valid_hunk = True min_distance = 0 patch_range_min = patch_range break elif d1 * d2 <= 0: # comment is possibly inside the hunk d1_clip = abs(min(0, d1)) d2_clip = abs(min(0, d2)) d = max(d1_clip, d2_clip) if d < min_distance: patch_range_min = patch_range min_distance = min(min_distance, d) if not is_valid_hunk: if min_distance < 10: # 10 lines - a reasonable distance to consider the comment inside the hunk # make the suggestion non-committable, yet multi line suggestion['relevant_lines_start'] = max(suggestion['relevant_lines_start'], patch_range_min['start']) suggestion['relevant_lines_end'] = min(suggestion['relevant_lines_end'], patch_range_min['end']) body = suggestion['body'].strip() # present new diff code in collapsible existing_code = original_suggestion['existing_code'].rstrip() + "\n" improved_code = original_suggestion['improved_code'].rstrip() + "\n" diff = difflib.unified_diff(existing_code.split('\n'), improved_code.split('\n'), n=999) patch_orig = "\n".join(diff) patch = "\n".join(patch_orig.splitlines()[5:]).strip('\n') diff_code = f"\n\n
      New proposed code:\n\n```diff\n{patch.rstrip()}\n```" # replace ```suggestion ... ``` with diff_code, using regex: body = re.sub(r'```suggestion.*?```', diff_code, body, flags=re.DOTALL) body += "\n\n
      " suggestion['body'] = body get_logger().info(f"Comment was moved to a valid hunk, " f"start_line={suggestion['relevant_lines_start']}, end_line={suggestion['relevant_lines_end']}, file={file.filename}") else: get_logger().error(f"Comment is not inside a valid hunk, " f"start_line={suggestion['relevant_lines_start']}, end_line={suggestion['relevant_lines_end']}, file={file.filename}") except Exception as e: get_logger().error(f"Failed to process patch for committable comment, error: {e}") return code_suggestions_copy #Clone related def _prepare_clone_url_with_token(self, repo_url_to_clone: str) -> str | None: scheme = "https://" #For example, to clone: #https://github.com/Codium-ai/pr-agent-pro.git #Need to embed inside the github token: #https://@github.com/Codium-ai/pr-agent-pro.git github_token = self.auth.token github_base_url = self.base_url_html if not all([github_token, github_base_url]): get_logger().error("Either missing auth token or missing base url") return None if scheme not in github_base_url: get_logger().error(f"Base url: {github_base_url} is missing prefix: {scheme}") return None github_com = github_base_url.split(scheme)[1] # e.g. 
'github.com' or github..com if not github_com: get_logger().error(f"Base url: {github_base_url} has an empty base url") return None if github_com not in repo_url_to_clone: get_logger().error(f"url to clone: {repo_url_to_clone} does not contain {github_com}") return None repo_full_name = repo_url_to_clone.split(github_com)[-1] if not repo_full_name: get_logger().error(f"url to clone: {repo_url_to_clone} is malformed") return None clone_url = scheme if self.deployment_type == 'app': clone_url += "git:" clone_url += f"{github_token}@{github_com}{repo_full_name}" return clone_url ================================================ FILE: pr_agent/git_providers/gitlab_provider.py ================================================ import difflib import hashlib import re import urllib.parse from typing import Any, Optional, Tuple, Union from urllib.parse import parse_qs, urlparse import gitlab import requests from gitlab import (GitlabAuthenticationError, GitlabCreateError, GitlabGetError, GitlabUpdateError) from pr_agent.algo.types import EDIT_TYPE, FilePatchInfo from ..algo.file_filter import filter_ignored from ..algo.git_patch_processing import decode_if_bytes from ..algo.language_handler import is_valid_file from ..algo.utils import (clip_tokens, find_line_number_of_relevant_line_in_file, load_large_diff) from ..config_loader import get_settings from ..log import get_logger from .git_provider import MAX_FILES_ALLOWED_FULL, GitProvider class DiffNotFoundError(Exception): """Raised when the diff for a merge request cannot be found.""" pass class GitLabProvider(GitProvider): def __init__(self, merge_request_url: Optional[str] = None, incremental: Optional[bool] = False): gitlab_url = get_settings().get("GITLAB.URL", None) if not gitlab_url: raise ValueError("GitLab URL is not set in the config file") self.gitlab_url = gitlab_url ssl_verify = get_settings().get("GITLAB.SSL_VERIFY", True) gitlab_access_token = get_settings().get("GITLAB.PERSONAL_ACCESS_TOKEN", None) if not 
gitlab_access_token: raise ValueError("GitLab personal access token is not set in the config file") # Authentication method selection via configuration auth_method = get_settings().get("GITLAB.AUTH_TYPE", "oauth_token") # Basic validation of authentication type if auth_method not in ["oauth_token", "private_token"]: raise ValueError(f"Unsupported GITLAB.AUTH_TYPE: '{auth_method}'. " f"Must be 'oauth_token' or 'private_token'.") # Create GitLab instance based on authentication method try: if auth_method == "oauth_token": self.gl = gitlab.Gitlab( url=gitlab_url, oauth_token=gitlab_access_token, ssl_verify=ssl_verify ) else: # private_token self.gl = gitlab.Gitlab( url=gitlab_url, private_token=gitlab_access_token, ssl_verify=ssl_verify ) except Exception as e: get_logger().error(f"Failed to create GitLab instance: {e}") raise ValueError(f"Unable to authenticate with GitLab: {e}") self.max_comment_chars = 65000 self.id_project = None self.id_mr = None self.mr = None self.diff_files = None self.git_files = None self.temp_comments = [] self._submodule_cache: dict[tuple[str, str, str], list[dict]] = {} self.pr_url = merge_request_url self._set_merge_request(merge_request_url) self.RE_HUNK_HEADER = re.compile( r"^@@ -(\d+)(?:,(\d+))? \+(\d+)(?:,(\d+))? @@[ ]?(.*)") self.incremental = incremental # --- submodule expansion helpers (opt-in) --- def _get_gitmodules_map(self) -> dict[str, str]: """ Return {submodule_path -> repo_url} from '.gitmodules' (best effort). Tries target branch first, then source branch. Always returns text. 
""" try: proj = self.gl.projects.get(self.id_project) except Exception: return {} import base64 def _read_text(ref: str | None) -> str | None: if not ref: return None try: f = proj.files.get(file_path=".gitmodules", ref=ref) except Exception: return None # 1) python-gitlab File.decode() – usually returns BYTES try: raw = f.decode() if isinstance(raw, (bytes, bytearray)): return raw.decode("utf-8", "ignore") if isinstance(raw, str): return raw except Exception: pass # 2) fallback: base64 decode f.content try: c = getattr(f, "content", None) if c: return base64.b64decode(c).decode("utf-8", "ignore") except Exception: pass return None content = ( _read_text(getattr(self.mr, "target_branch", None)) or _read_text(getattr(self.mr, "source_branch", None)) ) if not content: return {} import configparser parser = configparser.ConfigParser( delimiters=("=",), interpolation=None, inline_comment_prefixes=("#", ";"), strict=False, ) try: parser.read_string(content) except Exception: return {} out: dict[str, str] = {} for section in parser.sections(): if not section.lower().startswith("submodule"): continue path = parser.get(section, "path", fallback=None) url = parser.get(section, "url", fallback=None) if path and url: path = path.strip().strip('"').strip("'") url = url.strip().strip('"').strip("'") out[path] = url return out def _url_to_project_path(self, url: str) -> str | None: """ Convert ssh/https GitLab URL to 'group/subgroup/repo' project path. """ try: if url.startswith("git@") and ":" in url: path = url.split(":", 1)[1] else: path = urllib.parse.urlparse(url).path.lstrip("/") if path.endswith(".git"): path = path[:-4] return path or None except Exception: return None def _project_by_path(self, proj_path: str): """ Resolve a project by path with multiple strategies: 1) URL-encoded path_with_namespace 2) Raw path_with_namespace 3) Search fallback + exact match on path_with_namespace (case-insensitive) Returns a project object or None. 
""" if not proj_path: return None # 1) Encoded try: enc = urllib.parse.quote_plus(proj_path) return self.gl.projects.get(enc) except Exception: pass # 2) Raw try: return self.gl.projects.get(proj_path) except Exception: pass # 3) Search fallback try: name = proj_path.split("/")[-1] # membership=True so we don't leak other people's repos matches = self.gl.projects.list(search=name, simple=True, membership=True, per_page=100) # prefer exact path_with_namespace match (case-insensitive) for p in matches: pwn = getattr(p, "path_with_namespace", "") if pwn.lower() == proj_path.lower(): return self.gl.projects.get(p.id) if matches: get_logger().warning(f"[submodule] no exact match for {proj_path} (skip)") except Exception: pass return None def _compare_submodule(self, proj_path: str, old_sha: str, new_sha: str) -> list[dict]: """ Call repository_compare on submodule project; return list of diffs. """ key = (proj_path, old_sha, new_sha) if key in self._submodule_cache: return self._submodule_cache[key] try: proj = self._project_by_path(proj_path) if proj is None: get_logger().warning(f"[submodule] resolve failed for {proj_path}") self._submodule_cache[key] = [] return [] cmp = proj.repository_compare(old_sha, new_sha) if isinstance(cmp, dict): diffs = cmp.get("diffs", []) or [] else: diffs = [] self._submodule_cache[key] = diffs return diffs except Exception as e: get_logger().warning(f"[submodule] compare failed for {proj_path} {old_sha}..{new_sha}: {e}") self._submodule_cache[key] = [] return [] def _expand_submodule_changes(self, changes: list[dict]) -> list[dict]: """ If enabled, expand 'Subproject commit' bumps into real file diffs from the submodule. Soft-fail on any issue. 
""" try: if not bool(get_settings().get("GITLAB.EXPAND_SUBMODULE_DIFFS", False)): return changes except Exception: return changes gitmodules = self._get_gitmodules_map() if not gitmodules: return changes out = list(changes) for ch in changes: patch = ch.get("diff") or "" if "Subproject commit" not in patch: continue # Extract old/new SHAs from the hunk old_m = re.search(r"^-Subproject commit ([0-9a-f]{7,40})", patch, re.M) new_m = re.search(r"^\+Subproject commit ([0-9a-f]{7,40})", patch, re.M) if not (old_m and new_m): continue old_sha, new_sha = old_m.group(1), new_m.group(1) sub_path = ch.get("new_path") or ch.get("old_path") or "" repo_url = gitmodules.get(sub_path) if not repo_url: get_logger().warning(f"[submodule] no url for '{sub_path}' in .gitmodules (skip)") continue proj_path = self._url_to_project_path(repo_url) if not proj_path: get_logger().warning(f"[submodule] cannot parse project path from url '{repo_url}' (skip)") continue get_logger().info(f"[submodule] {sub_path} url={repo_url} -> proj_path={proj_path}") sub_diffs = self._compare_submodule(proj_path, old_sha, new_sha) for sd in sub_diffs: sd_diff = sd.get("diff") or "" sd_old = sd.get("old_path") or sd.get("a_path") or "" sd_new = sd.get("new_path") or sd.get("b_path") or sd_old out.append({ "old_path": f"{sub_path}/{sd_old}" if sd_old else sub_path, "new_path": f"{sub_path}/{sd_new}" if sd_new else sub_path, "diff": sd_diff, "new_file": sd.get("new_file", False), "deleted_file": sd.get("deleted_file", False), "renamed_file": sd.get("renamed_file", False), }) return out def is_supported(self, capability: str) -> bool: if capability in ['get_issue_comments', 'create_inline_comment', 'publish_inline_comments', 'publish_file_comments']: # gfm_markdown is supported in gitlab ! 
def get_canonical_url_parts(self, repo_git_url:str=None, desired_branch:str=None) -> Tuple[str, str]:
    """
    Return the (prefix, suffix) pair needed to build a link for viewing a file of a repo.

    Example: https://gitlab.com/codiumai/pr-agent.git and branch: t1 ->
    prefix: "https://gitlab.com/codiumai/pr-agent/-/blob/t1", suffix: "?ref_type=heads"

    Args:
        repo_git_url: Git URL of the repository. When omitted, the PR context (self.pr_url) is
            used to determine the project path.
        desired_branch: Branch to link to. Only honoured together with repo_git_url; in the
            PR-context case it is replaced by the project's default branch.

    Returns:
        Tuple[str, str]: (prefix, suffix), or ("", "") when neither a repo URL nor a PR URL is
        available, or when the default branch cannot be retrieved.
    """
    if not repo_git_url and not self.pr_url:
        get_logger().error("Cannot get canonical URL parts: missing either context PR URL or a repo GIT URL")
        return ("", "")

    if not repo_git_url:  # Use PR url as context
        repo_path = self._get_project_path_from_pr_or_issue_url(self.pr_url)
        try:
            desired_branch = self.gl.projects.get(self.id_project).default_branch
        except Exception:
            get_logger().exception(f"Cannot get PR: {self.pr_url} default branch. Tried project ID: {self.id_project}")
            return ("", "")
    else:  # Use repo git url
        base_url = (self.gitlab_url or "").rstrip('/')
        parsed = urlparse(repo_git_url)
        if base_url and repo_git_url.startswith(f"{base_url}/"):
            # The repo lives under the configured instance URL: whatever follows it is the project path.
            repo_path = repo_git_url[len(base_url) + 1:]
        elif parsed.scheme and parsed.netloc:
            # Any other http(s)-style URL: the project path is the URL path, whatever the host's TLD is.
            repo_path = parsed.path
        else:
            # Scheme-less input: keep the historical best-effort split on '.com/'.
            repo_path = repo_git_url.split('.com/')[-1]
        # Only a trailing '.git' is dropped, so a '.git' inside a host or project name is left intact.
        repo_path = repo_path.strip('/').removesuffix('.git')

    prefix = f"{self.gitlab_url}/{repo_path}/-/blob/{desired_branch}"
    suffix = "?ref_type=heads"  # gitlab cloud adds this suffix. gitlab server does not, but it is harmless.
    return (prefix, suffix)
def create_or_update_pr_file(self, file_path: str, branch: str, contents="", message="") -> None:
    """
    Create or update a file in the GitLab repository.

    The file is looked up on the given branch first. If the lookup succeeds the file is
    updated, otherwise (lookup raised GitlabGetError) it is created.

    Args:
        file_path: Path of the file inside the repository.
        branch: Branch to commit to.
        contents: New file contents.
        message: Commit message. When empty, defaults to "Update <file_path>" for an existing
            file and "Create <file_path>" for a new one.

    Raises:
        GitlabAuthenticationError, GitlabCreateError, GitlabUpdateError, Exception:
            logged and re-raised to the caller.
    """
    try:
        project = self.gl.projects.get(self.id_project)

        # Only the lookup decides between "update" and "create"; keep the try body minimal.
        try:
            existing_file = project.files.get(file_path, branch)
        except GitlabGetError:
            existing_file = None

        if existing_file is not None:
            existing_file.content = contents
            existing_file.save(branch=branch, commit_message=message or f"Update {file_path}")
            get_logger().debug(f"Updated file {file_path} in branch {branch}")
        else:
            project.files.create({
                'file_path': file_path,
                'branch': branch,
                'content': contents,
                'commit_message': message or f"Create {file_path}"
            })
            get_logger().debug(f"Created file {file_path} in branch {branch}")
    except GitlabAuthenticationError as e:
        get_logger().error(f"Authentication failed while creating/updating file {file_path} in branch {branch}: {e}")
        raise
    except (GitlabCreateError, GitlabUpdateError) as e:
        get_logger().error(f"Permission denied or validation error for file {file_path} in branch {branch}: {e}")
        raise
    except Exception as e:
        get_logger().exception(f"Unexpected error creating/updating file {file_path} in branch {branch}: {e}")
        raise
""" if self.diff_files: return self.diff_files # filter files using [ignore] patterns raw_changes = self.mr.changes().get('changes', []) raw_changes = self._expand_submodule_changes(raw_changes) diffs_original = raw_changes diffs = filter_ignored(diffs_original, 'gitlab') if diffs != diffs_original: try: names_original = [diff['new_path'] for diff in diffs_original] names_filtered = [diff['new_path'] for diff in diffs] get_logger().info(f"Filtered out [ignore] files for merge request {self.id_mr}", extra={ 'original_files': names_original, 'filtered_files': names_filtered }) except Exception as e: pass diff_files = [] invalid_files_names = [] counter_valid = 0 for diff in diffs: if not is_valid_file(diff['new_path']): invalid_files_names.append(diff['new_path']) continue # allow only a limited number of files to be fully loaded. We can manage the rest with diffs only counter_valid += 1 if counter_valid < MAX_FILES_ALLOWED_FULL or not diff['diff']: original_file_content_str = self.get_pr_file_content(diff['old_path'], self.mr.diff_refs['base_sha']) new_file_content_str = self.get_pr_file_content(diff['new_path'], self.mr.diff_refs['head_sha']) else: if counter_valid == MAX_FILES_ALLOWED_FULL: get_logger().info(f"Too many files in PR, will avoid loading full content for rest of files") original_file_content_str = '' new_file_content_str = '' # Ensure content is properly decoded original_file_content_str = decode_if_bytes(original_file_content_str) new_file_content_str = decode_if_bytes(new_file_content_str) edit_type = EDIT_TYPE.MODIFIED if diff['new_file']: edit_type = EDIT_TYPE.ADDED elif diff['deleted_file']: edit_type = EDIT_TYPE.DELETED elif diff['renamed_file']: edit_type = EDIT_TYPE.RENAMED filename = diff['new_path'] patch = diff['diff'] if not patch: patch = load_large_diff(filename, new_file_content_str, original_file_content_str) # count number of lines added and removed patch_lines = patch.splitlines(keepends=True) num_plus_lines = len([line for line in 
def get_files(self) -> list:
    """
    Return the new paths of all files changed in the merge request.

    The result is computed once (including expanded submodule changes) and cached in
    self.git_files; entries without a 'new_path' are left out.
    """
    if self.git_files:
        return self.git_files

    expanded_changes = self._expand_submodule_changes(self.mr.changes().get('changes', []))
    collected_paths = []
    for change in expanded_changes:
        new_path = change.get('new_path')
        if new_path:
            collected_paths.append(new_path)
    self.git_files = collected_paths
    return self.git_files
def publish_inline_comment(self, body: str, relevant_file: str, relevant_line_in_file: str, original_suggestion=None):
    """
    Publish a comment attached to a specific line of a file in the merge request.

    The body is first clipped to self.max_comment_chars, then the line is located via
    search_line() and the result is handed to send_inline_comment().
    """
    clipped_body = self.limit_output_characters(body, self.max_comment_chars)
    location = self.search_line(relevant_file, relevant_line_in_file)
    edit_type, found, source_line_no, target_file, target_line_no = location
    self.send_inline_comment(
        clipped_body,
        edit_type,
        found,
        relevant_file,
        relevant_line_in_file,
        source_line_no,
        target_file,
        target_line_no,
        original_suggestion,
    )
diff = self.get_relevant_diff(relevant_file, relevant_line_in_file) if diff is None: get_logger().error(f"Could not get diff for merge request {self.id_mr}") raise DiffNotFoundError(f"Could not get diff for merge request {self.id_mr}") pos_obj = {'position_type': 'text', 'new_path': target_file.filename, 'old_path': target_file.old_filename if target_file.old_filename else target_file.filename, 'base_sha': diff.base_commit_sha, 'start_sha': diff.start_commit_sha, 'head_sha': diff.head_commit_sha} if edit_type == 'deletion': pos_obj['old_line'] = source_line_no - 1 elif edit_type == 'addition': pos_obj['new_line'] = target_line_no - 1 else: pos_obj['new_line'] = target_line_no - 1 pos_obj['old_line'] = source_line_no - 1 get_logger().debug(f"Creating comment in MR {self.id_mr} with body {body} and position {pos_obj}") try: self.mr.discussions.create({'body': body, 'position': pos_obj}) except Exception as e: try: # fallback - create a general note on the file in the MR if 'suggestion_orig_location' in original_suggestion: line_start = original_suggestion['suggestion_orig_location']['start_line'] line_end = original_suggestion['suggestion_orig_location']['end_line'] old_code_snippet = original_suggestion['prev_code_snippet'] new_code_snippet = original_suggestion['new_code_snippet'] content = original_suggestion['suggestion_summary'] label = original_suggestion['category'] if 'score' in original_suggestion: score = original_suggestion['score'] else: score = 7 else: line_start = original_suggestion['relevant_lines_start'] line_end = original_suggestion['relevant_lines_end'] old_code_snippet = original_suggestion['existing_code'] new_code_snippet = original_suggestion['improved_code'] content = original_suggestion['suggestion_content'] label = original_suggestion['label'] score = original_suggestion.get('score', 7) if hasattr(self, 'main_language'): language = self.main_language else: language = '' link = self.get_line_link(relevant_file, line_start, line_end) 
body_fallback =f"**Suggestion:** {content} [{label}, importance: {score}]\n\n" body_fallback +=f"\n\n
      [{target_file.filename} [{line_start}-{line_end}]]({link}):\n\n" body_fallback += f"\n\n___\n\n`(Cannot implement directly - GitLab API allows committable suggestions strictly on MR diff lines)`" body_fallback+="
      \n\n" diff_patch = difflib.unified_diff(old_code_snippet.split('\n'), new_code_snippet.split('\n'), n=999) patch_orig = "\n".join(diff_patch) patch = "\n".join(patch_orig.splitlines()[5:]).strip('\n') diff_code = f"\n\n```diff\n{patch.rstrip()}\n```" body_fallback += diff_code # Create a general note on the file in the MR self.mr.notes.create({ 'body': body_fallback, 'position': { 'base_sha': diff.base_commit_sha, 'start_sha': diff.start_commit_sha, 'head_sha': diff.head_commit_sha, 'position_type': 'text', 'file_path': f'{target_file.filename}', } }) get_logger().debug(f"Created fallback comment in MR {self.id_mr} with position {pos_obj}") # get_logger().debug( # f"Failed to create comment in MR {self.id_mr} with position {pos_obj} (probably not a '+' line)") except Exception as e: get_logger().exception(f"Failed to create comment in MR {self.id_mr}") def get_relevant_diff(self, relevant_file: str, relevant_line_in_file: str) -> Optional[dict]: _changes = self.mr.changes() # dict _changes['changes'] = self._expand_submodule_changes(_changes.get('changes', [])) changes = _changes if not changes: get_logger().error('No changes found for the merge request.') return None all_diffs = self.mr.diffs.list(get_all=True) if not all_diffs: get_logger().error('No diffs found for the merge request.') return None for diff in all_diffs: for change in changes['changes']: if change['new_path'] == relevant_file and relevant_line_in_file in change['diff']: return diff get_logger().debug( f'No relevant diff found for {relevant_file} {relevant_line_in_file}. 
Falling back to last diff.') return self.last_diff # fallback to last_diff if no relevant diff is found def publish_code_suggestions(self, code_suggestions: list) -> bool: for suggestion in code_suggestions: try: if suggestion and 'original_suggestion' in suggestion: original_suggestion = suggestion['original_suggestion'] else: original_suggestion = suggestion body = suggestion['body'] relevant_file = suggestion['relevant_file'] relevant_lines_start = suggestion['relevant_lines_start'] relevant_lines_end = suggestion['relevant_lines_end'] diff_files = self.get_diff_files() target_file = None for file in diff_files: if file.filename == relevant_file: if file.filename == relevant_file: target_file = file break range = relevant_lines_end - relevant_lines_start # no need to add 1 body = body.replace('```suggestion', f'```suggestion:-0+{range}') lines = target_file.head_file.splitlines() relevant_line_in_file = lines[relevant_lines_start - 1] # edit_type, found, source_line_no, target_file, target_line_no = self.find_in_file(target_file, # relevant_line_in_file) # for code suggestions, we want to edit the new code source_line_no = -1 target_line_no = relevant_lines_start + 1 found = True edit_type = 'addition' self.send_inline_comment(body, edit_type, found, relevant_file, relevant_line_in_file, source_line_no, target_file, target_line_no, original_suggestion) except Exception as e: get_logger().exception(f"Could not publish code suggestion:\nsuggestion: {suggestion}\nerror: {e}") # note that we publish suggestions one-by-one. 
def search_line(self, relevant_file, relevant_line_in_file):
    """
    Locate a line inside the patch of the given file of the merge request.

    Args:
        relevant_file: Filename to look for among the files returned by get_diff_files().
        relevant_line_in_file: Line text to search for in that file's patch.

    Returns:
        Tuple (edit_type, found, source_line_no, target_file, target_line_no). When a diff file
        with that filename exists, the tuple is the result of find_in_file() for it (if several
        entries share the filename, the last one wins). Otherwise found is False, target_file
        is None, both line numbers are 0 and edit_type is derived from the line itself.
    """
    # Defaults for the "file is not part of the diff" case, so the return below is always defined.
    edit_type = self.get_edit_type(relevant_line_in_file)
    found = False
    source_line_no = 0
    target_file = None
    target_line_no = 0

    for file in self.get_diff_files():
        if file.filename == relevant_file:
            edit_type, found, source_line_no, target_file, target_line_no = self.find_in_file(
                file, relevant_line_in_file)
    return edit_type, found, source_line_no, target_file, target_line_no
def get_repo_settings(self):
    """
    Read the '.pr_agent.toml' file from the project's default branch.

    Returns:
        The decoded file contents as returned by the GitLab file object's decode(), or an
        empty string when the project or the file cannot be retrieved.
    """
    try:
        # Fetch the project once and reuse it for both the default branch and the file lookup.
        project = self.gl.projects.get(self.id_project)
        main_branch = project.default_branch
        return project.files.get(file_path='.pr_agent.toml', ref=main_branch).decode()
    except Exception:
        # Best effort: any failure to read the settings file is reported as "no settings".
        return ""
get_logger().warning(f"Comment with ID {issue_comment_id} not found in merge request {self.id_mr}.") return False reactions = comment.awardemojis.list() for reaction in reactions: if reaction.name == reaction_id: reaction.delete() return True get_logger().warning(f"Reaction '{reaction_id}' not found in comment {issue_comment_id}.") return False except Exception as e: get_logger().warning(f"Failed to remove reaction, error: {e}") return False def _parse_merge_request_url(self, merge_request_url: str) -> Tuple[str, int]: parsed_url = urlparse(merge_request_url) path_parts = parsed_url.path.strip('/').split('/') if 'merge_requests' not in path_parts: raise ValueError("The provided URL does not appear to be a GitLab merge request URL") mr_index = path_parts.index('merge_requests') # Ensure there is an ID after 'merge_requests' if len(path_parts) <= mr_index + 1: raise ValueError("The provided URL does not contain a merge request ID") try: mr_id = int(path_parts[mr_index + 1]) except ValueError as e: raise ValueError("Unable to convert merge request ID to integer") from e # Handle special delimiter (-) project_path = "/".join(path_parts[:mr_index]) if project_path.endswith('/-'): project_path = project_path[:-2] # Return the path before 'merge_requests' and the ID return project_path, mr_id def _get_merge_request(self): mr = self.gl.projects.get(self.id_project).mergerequests.get(self.id_mr) return mr def get_user_id(self): return None def publish_labels(self, pr_types): try: self.mr.labels = list(set(pr_types)) self.mr.save() except Exception as e: get_logger().warning(f"Failed to publish labels, error: {e}") def publish_inline_comments(self, comments: list[dict]): pass def get_pr_labels(self, update=False): return self.mr.labels def get_repo_labels(self): return self.gl.projects.get(self.id_project).labels.list() def get_commit_messages(self): """ Retrieves the commit messages of a pull request. 
def get_line_link(self, relevant_file: str, relevant_line_start: int, relevant_line_end: int = None) -> str:
    """
    Build a link to a file on the merge request's source branch, optionally anchored to lines.

    A relevant_line_start of -1 yields a link to the file itself; otherwise the link is
    anchored to '#L<start>-<end>' when relevant_line_end is given, or to '#L<start>'.
    """
    file_link = f"{self.gl.url}/{self.id_project}/-/blob/{self.mr.source_branch}/{relevant_file}?ref_type=heads"
    if relevant_line_start == -1:
        return file_link
    if relevant_line_end:
        return f"{file_link}#L{relevant_line_start}-{relevant_line_end}"
    return f"{file_link}#L{relevant_line_start}"
#Clone related def _prepare_clone_url_with_token(self, repo_url_to_clone: str) -> str | None: if "gitlab." not in repo_url_to_clone: get_logger().error(f"Repo URL: {repo_url_to_clone} is not a valid gitlab URL.") return None (scheme, base_url) = repo_url_to_clone.split("gitlab.") access_token = getattr(self.gl, 'oauth_token', None) or getattr(self.gl, 'private_token', None) if not all([scheme, access_token, base_url]): get_logger().error(f"Either no access token found, or repo URL: {repo_url_to_clone} " f"is missing prefix: {scheme} and/or base URL: {base_url}.") return None #Note that the ""official"" method found here: # https://docs.gitlab.com/user/profile/personal_access_tokens/#clone-repository-using-personal-access-token # requires a username, which may not be applicable. # The following solution is taken from: https://stackoverflow.com/questions/25409700/using-gitlab-token-to-clone-without-authentication/35003812#35003812 # For example: For repo url: https://gitlab.codium-inc.com/qodo/autoscraper.git # Then to clone one will issue: 'git clone https://oauth2:@gitlab.codium-inc.com/qodo/autoscraper.git' clone_url = f"{scheme}oauth2:{access_token}@gitlab.{base_url}" return clone_url ================================================ FILE: pr_agent/git_providers/local_git_provider.py ================================================ from collections import Counter from pathlib import Path from typing import List from git import Repo from pr_agent.algo.types import EDIT_TYPE, FilePatchInfo from pr_agent.config_loader import _find_repository_root, get_settings from pr_agent.git_providers.git_provider import GitProvider from pr_agent.log import get_logger class PullRequestMimic: """ This class mimics the PullRequest class from the PyGithub library for the LocalGitProvider. 
""" def __init__(self, title: str, diff_files: List[FilePatchInfo]): self.title = title self.diff_files = diff_files class LocalGitProvider(GitProvider): """ This class implements the GitProvider interface for local git repositories. It mimics the PR functionality of the GitProvider interface, but does not require a hosted git repository. Instead of providing a PR url, the user provides a local branch path to generate a diff-patch. For the MVP it only supports the /review and /describe capabilities. """ def __init__(self, target_branch_name, incremental=False): self.repo_path = _find_repository_root() if self.repo_path is None: raise ValueError('Could not find repository root') self.repo = Repo(self.repo_path) self.head_branch_name = self.repo.head.ref.name self.target_branch_name = target_branch_name self._prepare_repo() self.diff_files = None self.pr = PullRequestMimic(self.get_pr_title(), self.get_diff_files()) self.description_path = get_settings().get('local.description_path') \ if get_settings().get('local.description_path') is not None else self.repo_path / 'description.md' self.review_path = get_settings().get('local.review_path') \ if get_settings().get('local.review_path') is not None else self.repo_path / 'review.md' # inline code comments are not supported for local git repositories get_settings().pr_reviewer.inline_code_comments = False def _prepare_repo(self): """ Prepare the repository for PR-mimic generation. """ get_logger().debug('Preparing repository for PR-mimic generation...') if self.repo.is_dirty(): raise ValueError('The repository is not in a clean state. 
Please commit or stash pending changes.') if self.target_branch_name not in self.repo.heads: raise KeyError(f'Branch: {self.target_branch_name} does not exist') def is_supported(self, capability: str) -> bool: if capability in ['get_issue_comments', 'create_inline_comment', 'publish_inline_comments', 'get_labels', 'gfm_markdown']: return False return True def get_diff_files(self) -> list[FilePatchInfo]: diffs = self.repo.head.commit.diff( self.repo.merge_base(self.repo.head, self.repo.branches[self.target_branch_name]), create_patch=True, R=True ) diff_files = [] for diff_item in diffs: if diff_item.a_blob is not None: original_file_content_str = diff_item.a_blob.data_stream.read().decode('utf-8') else: original_file_content_str = "" # empty file if diff_item.b_blob is not None: new_file_content_str = diff_item.b_blob.data_stream.read().decode('utf-8') else: new_file_content_str = "" # empty file edit_type = EDIT_TYPE.MODIFIED if diff_item.new_file: edit_type = EDIT_TYPE.ADDED elif diff_item.deleted_file: edit_type = EDIT_TYPE.DELETED elif diff_item.renamed_file: edit_type = EDIT_TYPE.RENAMED diff_files.append( FilePatchInfo(original_file_content_str, new_file_content_str, diff_item.diff.decode('utf-8'), diff_item.b_path, edit_type=edit_type, old_filename=None if diff_item.a_path == diff_item.b_path else diff_item.a_path ) ) self.diff_files = diff_files return diff_files def get_files(self) -> List[str]: """ Returns a list of files with changes in the diff. 
""" diff_index = self.repo.head.commit.diff( self.repo.merge_base(self.repo.head, self.repo.branches[self.target_branch_name]), R=True ) # Get the list of changed files diff_files = [item.a_path for item in diff_index] return diff_files def publish_description(self, pr_title: str, pr_body: str): with open(self.description_path, "w") as file: # Write the string to the file file.write(pr_title + '\n' + pr_body) def publish_comment(self, pr_comment: str, is_temporary: bool = False): with open(self.review_path, "w") as file: # Write the string to the file file.write(pr_comment) def publish_inline_comment(self, body: str, relevant_file: str, relevant_line_in_file: str, original_suggestion=None): raise NotImplementedError('Publishing inline comments is not implemented for the local git provider') def publish_inline_comments(self, comments: list[dict]): raise NotImplementedError('Publishing inline comments is not implemented for the local git provider') def publish_code_suggestion(self, body: str, relevant_file: str, relevant_lines_start: int, relevant_lines_end: int): raise NotImplementedError('Publishing code suggestions is not implemented for the local git provider') def publish_code_suggestions(self, code_suggestions: list) -> bool: raise NotImplementedError('Publishing code suggestions is not implemented for the local git provider') def publish_labels(self, labels): pass # Not applicable to the local git provider, but required by the interface def remove_initial_comment(self): pass # Not applicable to the local git provider, but required by the interface def remove_comment(self, comment): pass # Not applicable to the local git provider, but required by the interface def add_eyes_reaction(self, comment): pass # Not applicable to the local git provider, but required by the interface def get_commit_messages(self): pass # Not applicable to the local git provider, but required by the interface def get_repo_settings(self): pass # Not applicable to the local git provider, 
but required by the interface def remove_reaction(self, comment): pass # Not applicable to the local git provider, but required by the interface def get_languages(self): """ Calculate percentage of languages in repository. Used for hunk prioritisation. """ # Get all files in repository filepaths = [Path(item.path) for item in self.repo.tree().traverse() if item.type == 'blob'] # Identify language by file extension and count lang_count = Counter(ext.lstrip('.') for filepath in filepaths for ext in [filepath.suffix.lower()]) # Convert counts to percentages total_files = len(filepaths) lang_percentage = {lang: count / total_files * 100 for lang, count in lang_count.items()} return lang_percentage def get_pr_branch(self): return self.repo.head def get_user_id(self): return -1 # Not used anywhere for the local provider, but required by the interface def get_pr_description_full(self): commits_diff = list(self.repo.iter_commits(self.target_branch_name + '..HEAD')) # Get the commit messages and concatenate commit_messages = " ".join([commit.message for commit in commits_diff]) # TODO Handle the description better - maybe use gpt-3.5 summarisation here? return commit_messages[:200] # Use max 200 characters def get_pr_title(self): """ Substitutes the branch-name as the PR-mimic title. 
def apply_repo_settings(pr_url):
    """
    Merge the repository-level settings file of the PR's repo into the active settings.

    The repo settings (raw bytes) are taken from the request context when available,
    otherwise fetched from the git provider and cached back into the context. They are
    written to a temporary TOML file, loaded with Dynaconf, and merged section by
    section into ``get_settings()``. Failures while loading are reported through
    ``handle_configurations_errors``; the temporary file is always removed.

    Finally, the short model alias 'claude-3-5-sonnet' is expanded via ``set_claude_model``.
    """
    os.environ["AUTO_CAST_FOR_DYNACONF"] = "false"
    git_provider = get_git_provider_with_context(pr_url)
    if get_settings().config.use_repo_settings_file:
        repo_settings_file = None
        try:
            try:
                repo_settings = context.get("repo_settings", None)
            except Exception:
                # No request context available (e.g. CLI run) - fall back to fetching.
                repo_settings = None
            if repo_settings is None:  # None is different from "", which is a valid value
                repo_settings = git_provider.get_repo_settings()
                try:
                    context["repo_settings"] = repo_settings
                except Exception:
                    pass

            error_local = None
            if repo_settings:
                category = 'local'
                try:
                    fd, repo_settings_file = tempfile.mkstemp(suffix='.toml')
                    # mkstemp hands back an open descriptor that the caller owns: wrap it so it
                    # is flushed and closed before Dynaconf reads the file and before os.remove.
                    with os.fdopen(fd, "wb") as settings_fh:
                        settings_fh.write(repo_settings)

                    try:
                        dynconf_kwargs = {'core_loaders': [],  # DISABLE default loaders, otherwise will load toml files more than once.
                                          'loaders': ['pr_agent.custom_merge_loader'],
                                          # Use a custom loader to merge sections, but overwrite their overlapping values. Don't involve ENV variables.
                                          'merge_enabled': True  # Merge multiple files; ensures [XYZ] sections only overwrite overlapping keys, not whole sections.
                                          }
                        new_settings = Dynaconf(settings_files=[repo_settings_file],
                                                # Disable all dynamic loading features
                                                load_dotenv=False,  # Don't load .env files
                                                envvar_prefix=False,  # Drop DYNACONF for env. variables
                                                **dynconf_kwargs
                                                )
                    except TypeError as e:
                        # Fallback for older Dynaconf versions that don't support these parameters
                        get_logger().warning(
                            "Your Dynaconf version does not support disabled 'load_dotenv'/'merge_enabled' parameters. "
                            "Loading repo settings without these security features. "
                            "Please upgrade Dynaconf for better security.",
                            artifact={"error": e, "traceback": traceback.format_exc()})
                        new_settings = Dynaconf(settings_files=[repo_settings_file])

                    for section, contents in new_settings.as_dict().items():
                        if not contents:
                            # Skip excluded items, such as forbidden to load env.
                            get_logger().debug(f"Skipping a section: {section} which is not allowed")
                            continue
                        # Overlay the repo values on a copy of the current section, then replace it.
                        section_dict = copy.deepcopy(get_settings().as_dict().get(section, {}))
                        for key, value in contents.items():
                            section_dict[key] = value
                        get_settings().unset(section)
                        get_settings().set(section, section_dict, merge=False)
                    get_logger().info(f"Applying repo settings:\n{new_settings.as_dict()}")
                except Exception as e:
                    get_logger().warning(f"Failed to apply repo {category} settings, error: {str(e)}")
                    error_local = {'error': str(e), 'settings': repo_settings, 'category': category}

                if error_local:
                    handle_configurations_errors([error_local], git_provider)
        except Exception as e:
            get_logger().exception("Failed to apply repo settings", e)
        finally:
            if repo_settings_file:
                try:
                    os.remove(repo_settings_file)
                except Exception as e:
                    get_logger().error(f"Failed to remove temporary settings file {repo_settings_file}", e)

    # enable switching models with a short definition
    if get_settings().config.model.lower() == 'claude-3-5-sonnet':
        set_claude_model()
a valid [TOML](https://qodo-merge-docs.qodo.ai/usage-guide/configuration_options/), please fix it.\n\n" body += f"___\n\n**Error message:**\n`{err_message}`\n\n" if git_provider.is_supported("gfm_markdown"): body += f"\n\n
      Configuration content:\n\n```toml\n{configuration_file_content}\n```\n\n
      " else: body += f"\n\n**Configuration content:**\n\n```toml\n{configuration_file_content}\n```\n\n" get_logger().warning(f"Sending a 'configuration error' comment to the PR", artifact={'body': body}) # git_provider.publish_comment(body) if hasattr(git_provider, 'publish_persistent_comment'): git_provider.publish_persistent_comment(body, initial_header=header, update_header=False, final_update_message=False) else: git_provider.publish_comment(body) except Exception as e: get_logger().exception(f"Failed to handle configurations errors", e) def set_claude_model(): """ set the claude-sonnet-3.5 model easily (even by users), just by stating: --config.model='claude-3-5-sonnet' """ model_claude = "bedrock/anthropic.claude-3-5-sonnet-20240620-v1:0" get_settings().set('config.model', model_claude) get_settings().set('config.model_weak', model_claude) get_settings().set('config.fallback_models', [model_claude]) ================================================ FILE: pr_agent/identity_providers/__init__.py ================================================ from pr_agent.config_loader import get_settings from pr_agent.identity_providers.default_identity_provider import \ DefaultIdentityProvider _IDENTITY_PROVIDERS = { 'default': DefaultIdentityProvider } def get_identity_provider(): identity_provider_id = get_settings().get("CONFIG.IDENTITY_PROVIDER", "default") if identity_provider_id not in _IDENTITY_PROVIDERS: raise ValueError(f"Unknown identity provider: {identity_provider_id}") return _IDENTITY_PROVIDERS[identity_provider_id]() ================================================ FILE: pr_agent/identity_providers/default_identity_provider.py ================================================ from pr_agent.identity_providers.identity_provider import (Eligibility, IdentityProvider) class DefaultIdentityProvider(IdentityProvider): def verify_eligibility(self, git_provider, git_provider_id, pr_url): return Eligibility.ELIGIBLE def inc_invocation_count(self, git_provider, 
def analytics_filter(record: dict) -> bool:
    """Return the 'analytics' flag stored under the record's 'extra' mapping (False when absent)."""
    extra_fields = record.get("extra", {})
    return extra_fields.get("analytics", False)
class AWSSecretsManagerProvider(SecretProvider):
    """Secret provider that reads and writes secrets through a boto3 'secretsmanager' client."""

    def __init__(self):
        """
        Build the boto3 client and read the configured secret ARN.

        The region comes from 'aws_secrets_manager.region_name', falling back to
        'aws.AWS_REGION_NAME'; when neither is set the client is created without
        an explicit region. Raises ValueError when no secret ARN is configured.
        """
        try:
            region = (get_settings().get("aws_secrets_manager.region_name")
                      or get_settings().get("aws.AWS_REGION_NAME"))
            client_kwargs = {"region_name": region} if region else {}
            self.client = boto3.client('secretsmanager', **client_kwargs)

            self.secret_arn = get_settings().get("aws_secrets_manager.secret_arn")
            if not self.secret_arn:
                raise ValueError("AWS Secrets Manager ARN is not configured")
        except Exception as e:
            get_logger().error(f"Failed to initialize AWS Secrets Manager Provider: {e}")
            raise e

    def get_secret(self, secret_name: str) -> str:
        """
        Retrieve individual secret by name (for webhook tokens)

        Returns an empty string (and logs a warning) when the lookup fails.
        """
        try:
            secret_value = self.client.get_secret_value(SecretId=secret_name)['SecretString']
        except Exception as e:
            get_logger().warning(f"Failed to get secret {secret_name} from AWS Secrets Manager: {e}")
            return ""
        return secret_value

    def get_all_secrets(self) -> dict:
        """
        Retrieve all secrets for configuration override

        Loads the secret stored under the configured ARN and parses it as JSON;
        returns an empty dict (and logs an error) on any failure.
        """
        try:
            payload = self.client.get_secret_value(SecretId=self.secret_arn)
            parsed = json.loads(payload['SecretString'])
        except Exception as e:
            get_logger().error(f"Failed to get secrets from AWS Secrets Manager {self.secret_arn}: {e}")
            return {}
        return parsed

    def store_secret(self, secret_name: str, secret_value: str):
        """Write a new value for the named secret; logs and re-raises on failure."""
        request = {"SecretId": secret_name, "SecretString": secret_value}
        try:
            self.client.put_secret_value(**request)
        except Exception as e:
            get_logger().error(f"Failed to store secret {secret_name} in AWS Secrets Manager: {e}")
            raise e
class SecretProvider(ABC):
    """Abstract interface for secret storage backends."""

    @abstractmethod
    def get_secret(self, secret_name: str) -> str:
        """Return the secret stored under ``secret_name``."""

    @abstractmethod
    def store_secret(self, secret_name: str, secret_value: str):
        """Persist ``secret_value`` under ``secret_name``."""
pr_agent/servers/atlassian-connect.json ================================================ { "name": "CodiumAI PR-Agent", "description": "CodiumAI PR-Agent", "key": "app_key", "vendor": { "name": "CodiumAI", "url": "https://codium.ai" }, "authentication": { "type": "jwt" }, "baseUrl": "base_url", "lifecycle": { "installed": "/installed", "uninstalled": "/uninstalled" }, "scopes": [ "account", "repository:write", "pullrequest:write", "wiki" ], "contexts": [ "account" ], "modules": { "webhooks": [ { "event": "*", "url": "/webhook" } ] }, "links": { "privacy": "https://qodo.ai/privacy-policy", "terms": "https://qodo.ai/terms" } } ================================================ FILE: pr_agent/servers/azuredevops_server_webhook.py ================================================ # This file contains the code for the Azure DevOps Server webhook server. # The server listens for incoming webhooks from Azure DevOps Server and forwards them to the PR Agent. # ADO webhook documentation: https://learn.microsoft.com/en-us/azure/devops/service-hooks/services/webhooks?view=azure-devops import json import os import re import secrets from urllib.parse import unquote import uvicorn from fastapi import APIRouter, Depends, FastAPI, HTTPException, Request from fastapi.encoders import jsonable_encoder from fastapi.security import HTTPBasic, HTTPBasicCredentials from starlette import status from starlette.background import BackgroundTasks from starlette.middleware import Middleware from starlette.requests import Request from starlette.responses import JSONResponse from starlette_context.middleware import RawContextMiddleware from pr_agent.agent.pr_agent import PRAgent, command2class from pr_agent.algo.utils import update_settings_from_args from pr_agent.config_loader import get_settings from pr_agent.git_providers import get_git_provider_with_context from pr_agent.git_providers.azuredevops_provider import AzureDevopsProvider from pr_agent.git_providers.utils import apply_repo_settings 
async def handle_request_comment(url: str, body: str, thread_id: int, comment_id: int, log_context: dict):
    """
    Run the PR-Agent command contained in a PR thread comment.

    The comment text is first passed through ``handle_line_comment`` and then handed to
    the agent. When the agent reports the command as handled, the thread is marked
    "closed" and the provider's initial comment is removed. Any exception is logged
    and swallowed.
    """
    log_context.update(action=body, api_url=url)
    try:
        with get_logger().contextualize(**log_context):
            pr_agent = PRAgent()
            git_provider = get_git_provider_with_context(pr_url=url)
            body = handle_line_comment(body, thread_id, git_provider)

            def acknowledge():
                # Progress reply posted on the thread through the agent's notify hook.
                return git_provider.reply_to_thread(thread_id, "On it! ⏳", True)

            was_handled = await pr_agent.handle_request(url, body, notify=acknowledge)
            # mark command comment as closed
            if was_handled:
                git_provider.set_thread_status(thread_id, "closed")
                git_provider.remove_initial_comment()
    except Exception as e:
        get_logger().exception(f"Failed to handle webhook", artifact={"url": url, "body": body}, error=str(e))
# currently only basic auth is supported with azure webhooks
# for this reason, https must be enabled to ensure the credentials are not sent in clear text
def authorize(credentials: HTTPBasicCredentials = Depends(security)):
    """
    FastAPI dependency enforcing HTTP Basic auth on the webhook endpoint.

    Authentication is skipped when no webhook username/password is configured.
    Otherwise the supplied credentials must match both configured values.

    Raises:
        HTTPException: 401 when credentials are missing or do not match.
    """
    if WEBHOOK_USERNAME is None or WEBHOOK_PASSWORD is None:
        return

    is_user_ok = False
    is_pass_ok = False
    # `security` is created with auto_error=False, so a request without an
    # Authorization header arrives here as None rather than being rejected upstream.
    if credentials is not None:
        # compare_digest only accepts ASCII-only str; compare UTF-8 bytes so that
        # non-ASCII input is rejected with 401 instead of raising TypeError.
        is_user_ok = secrets.compare_digest(credentials.username.encode("utf-8"),
                                            str(WEBHOOK_USERNAME).encode("utf-8"))
        is_pass_ok = secrets.compare_digest(credentials.password.encode("utf-8"),
                                            str(WEBHOOK_PASSWORD).encode("utf-8"))
    if not (is_user_ok and is_pass_ok):
        raise HTTPException(
            status_code=status.HTTP_401_UNAUTHORIZED,
            detail='Incorrect username or password.',
            headers={'WWW-Authenticate': 'Basic'},
        )
log_context["event"] = data["eventType"] log_context["api_url"] = pr_url await _perform_commands_azure("pr_commands", PRAgent(), pr_url, log_context) return JSONResponse( status_code=status.HTTP_202_ACCEPTED, content=jsonable_encoder({"message": "webhook triggered successfully"}) ) elif data["eventType"] == "ms.vss-code.git-pullrequest-comment-event" and "content" in data["resource"]["comment"]: comment = data["resource"]["comment"] if available_commands_rgx.match(comment["content"]): if(data["resourceVersion"] == "2.0"): repo = data["resource"]["pullRequest"]["repository"]["webUrl"] pr_url = unquote(f'{repo}/pullrequest/{data["resource"]["pullRequest"]["pullRequestId"]}') action = comment["content"] thread_url = comment["_links"]["threads"]["href"] thread_id = int(thread_url.split("/")[-1]) comment_id = int(comment["id"]) pass else: # API V1 not supported as it does not contain the PR URL return JSONResponse( status_code=status.HTTP_400_BAD_REQUEST, content=json.dumps({"message": "version 1.0 webhook for Azure Devops PR comment is not supported. please upgrade to version 2.0"})), else: return JSONResponse( status_code=status.HTTP_400_BAD_REQUEST, content=json.dumps({"message": "Unsupported command"}), ) else: return JSONResponse( status_code=status.HTTP_204_NO_CONTENT, content=json.dumps({"message": "Unsupported event"}), ) log_context["event"] = data["eventType"] log_context["api_url"] = pr_url try: await handle_request_comment(pr_url, action, thread_id, comment_id, log_context) except Exception as e: get_logger().error("Azure DevOps Trigger failed. 
@router.post("/", dependencies=[Depends(authorize)])
async def handle_webhook(background_tasks: BackgroundTasks, request: Request):
    """
    Accept a webhook POST and process it in the background.

    The JSON payload is queued for ``handle_request_azure`` and a 202 response is
    returned immediately, without waiting for the processing to finish.
    """
    payload = await request.json()
    background_tasks.add_task(handle_request_azure, payload, {"server_type": "azure_devops_server"})
    acknowledgement = jsonable_encoder({"message": "webhook triggered successfully"})
    return JSONResponse(status_code=status.HTTP_202_ACCEPTED, content=acknowledgement)
@router.get("/")
async def handle_manifest(request: Request, response: Response):
    """
    Serve the Atlassian Connect manifest for this app.

    Reads 'atlassian-connect.json' from this module's directory, substitutes the
    'app_key' and 'base_url' placeholders with the configured Bitbucket values, and
    returns the result as JSON. If the substitution fails, the error is logged and
    the manifest is served with whatever substitutions succeeded.
    """
    cur_dir = os.path.dirname(os.path.abspath(__file__))
    # Context manager guarantees the file handle is closed even if read() fails.
    with open(os.path.join(cur_dir, "atlassian-connect.json"), "rt", encoding="utf-8") as manifest_file:
        manifest = manifest_file.read()
    try:
        manifest = manifest.replace("app_key", get_settings().bitbucket.app_key)
        manifest = manifest.replace("base_url", get_settings().bitbucket.base_url)
    except Exception:
        # Best effort: missing settings should not take the endpoint down.
        get_logger().error("Failed to replace api_key in Bitbucket manifest, trying to continue")
    manifest_obj = json.loads(manifest)
    return JSONResponse(manifest_obj)
data.get('data', {}) if not data_inner: get_logger().error("No data found in the webhook payload") return True pull_request = data_inner.get('pullrequest', {}) commits_api = pull_request.get('links', {}).get('commits', {}).get('href') if not commits_api: return False if not pull_request.get('updated_on'): return False bearer_token = context.get('bitbucket_bearer_token') headers = { 'Authorization': f'Bearer {bearer_token}', 'Accept': 'application/json' } response = requests.get(commits_api, headers=headers) if response.status_code != 200: get_logger().warning(f"Bitbucket commits API returned {response.status_code} for {commits_api}") return False username =_get_username(data) commits_data = response.json() or {} values = commits_data.get('values') or [] if (not values or not isinstance(values, list) or not values[0].get('author') or not values[0]['author'].get('user') or not values[0]['author']['user'].get('display_name')): get_logger().warning("No commits returned for pull request or one of the required fields missing; skipping push validation", artifact={'values': values}) return False commit_username = commits_data['values'][0]['author']['user']['display_name'] if username != commit_username: get_logger().warning(f"Mismatch in username {username} vs. 
commit_username {commit_username}") return False time_pr_updated = pull_request['updated_on'] time_last_commit = commits_data['values'][0]['date'] from datetime import datetime ts1 = datetime.fromisoformat(time_pr_updated) ts2 = datetime.fromisoformat(time_last_commit) diff = (ts1 - ts2).total_seconds() max_delta_seconds = 15 if diff > 0 and diff < max_delta_seconds: is_valid_push = True else: get_logger().debug(f"Too much time passed since last commit", artifact={'updated': time_pr_updated, 'last_commit': time_last_commit}) except Exception as e: get_logger().exception(f"Failed to validate time difference between last commit and PR update", artifact={'error': e, 'data': data}) return is_valid_push async def _perform_commands_bitbucket(commands_conf: str, agent: PRAgent, api_url: str, log_context: dict, data: dict): apply_repo_settings(api_url) if commands_conf == "pr_commands" and get_settings().config.disable_auto_feedback: # auto commands for PR, and auto feedback is disabled get_logger().info(f"Auto feedback is disabled, skipping auto commands for PR {api_url=}") return if commands_conf == "push_commands": if not get_settings().get("bitbucket_app.handle_push_trigger"): get_logger().info( "Bitbucket push trigger handling disabled via config; skipping push commands") return if data.get("event", "") == "pullrequest:created": if not should_process_pr_logic(data): return commands = get_settings().get(f"bitbucket_app.{commands_conf}", {}) get_settings().set("config.is_auto_command", True) if commands_conf == "push_commands": is_valid_push = await _validate_time_from_last_commit_to_pr_update(data) if not is_valid_push: get_logger().info(f"Bitbucket skipping 'pullrequest:updated' for push commands") return for command in commands: try: split_command = command.split(" ") command = split_command[0] args = split_command[1:] other_args = update_settings_from_args(args) new_command = ' '.join([command] + other_args) get_logger().info(f"Performing command: {new_command}") 
def is_bot_user(data) -> bool:
    """
    Tell whether the webhook was triggered by a non-human actor.

    Returns True only when the payload has an actor whose 'type' (case-insensitive)
    is something other than 'user'. A missing actor, or any error while inspecting
    the payload, yields False.
    """
    # allow actor type: user . if it's "AppUser" or "team" then it is a bot user
    allowed_actor_types = {"user"}
    try:
        actor = data.get("data", {}).get("actor", {})
        if not actor:
            return False
        if actor["type"].lower() in allowed_actor_types:
            return False
        get_logger().info(f"BitBucket actor type is not 'user', skipping: {actor}")
        return True
    except Exception as e:
        get_logger().error(f"Failed 'is_bot_user' logic: {e}")
    return False
@router.post("/webhook")
async def handle_github_webhooks(background_tasks: BackgroundTasks, request: Request):
    """
    Entry point for Bitbucket app webhooks.

    Always answers "OK" immediately and does the real work in a background task:
    bot actors and filtered PRs are skipped, the request JWT is verified against the
    shared secret stored for the client, a bearer token is obtained, and the
    configured commands (or the comment command) are run for the event.

    A missing or malformed Authorization header is logged and the event is dropped.
    """
    app_name = get_settings().get("CONFIG.APP_NAME", "Unknown")
    log_context = {"server_type": "bitbucket_app", "app_name": app_name}
    get_logger().debug(request.headers)

    # Always bind input_jwt so the background task never hits an unbound name;
    # expected header shape is "<scheme> <token>".
    input_jwt = None
    jwt_header = request.headers.get("authorization", None)
    if jwt_header:
        header_parts = jwt_header.split(" ")
        if len(header_parts) > 1:
            input_jwt = header_parts[1]

    data = await request.json()
    get_logger().debug(data)

    async def inner():
        try:
            # ignore bot users
            if is_bot_user(data):
                return "OK"

            # Check if the PR should be processed
            if data.get("event", "") == "pullrequest:created":
                if not should_process_pr_logic(data):
                    return "OK"

            # Get the username of the sender
            log_context["sender"] = _get_username(data)

            sender_id = data.get("data", {}).get("actor", {}).get("account_id", "")
            log_context["sender_id"] = sender_id

            if not input_jwt:
                get_logger().error("Failed to handle webhook: missing or malformed Authorization header")
                return "OK"

            # Read the (still unverified) claims only to find out which client's
            # shared secret to load; the signature is verified right after.
            jwt_parts = input_jwt.split(".")
            claim_part = jwt_parts[1]
            claim_part += "=" * (-len(claim_part) % 4)  # restore base64 padding
            decoded_claims = base64.urlsafe_b64decode(claim_part)
            claims = json.loads(decoded_claims)
            client_key = claims["iss"]
            secrets = json.loads(secret_provider.get_secret(client_key))
            shared_secret = secrets["shared_secret"]
            jwt.decode(input_jwt, shared_secret, audience=client_key, algorithms=["HS256"])

            bearer_token = await get_bearer_token(shared_secret, client_key)
            context['bitbucket_bearer_token'] = bearer_token
            context["settings"] = copy.deepcopy(global_settings)
            event = data["event"]
            agent = PRAgent()
            if event == "pullrequest:created":
                pr_url = data["data"]["pullrequest"]["links"]["html"]["href"]
                log_context["api_url"] = pr_url
                log_context["event"] = "pull_request"
                if pr_url:
                    with get_logger().contextualize(**log_context):
                        if get_identity_provider().verify_eligibility("bitbucket", sender_id, pr_url) is not Eligibility.NOT_ELIGIBLE:
                            if get_settings().get("bitbucket_app.pr_commands"):
                                await _perform_commands_bitbucket("pr_commands", agent, pr_url, log_context, data)
            elif event == "pullrequest:updated":
                # PR updated, might be from a push (we will validate this later)
                pr_url = data["data"]["pullrequest"]["links"]["html"]["href"]
                log_context["api_url"] = pr_url
                log_context["event"] = "pull_request"
                if pr_url:
                    with get_logger().contextualize(**log_context):
                        if get_identity_provider().verify_eligibility("bitbucket", sender_id, pr_url) is not Eligibility.NOT_ELIGIBLE:
                            if get_settings().get("bitbucket_app.push_commands"):
                                await _perform_commands_bitbucket("push_commands", agent, pr_url, log_context, data)
            elif event == "pullrequest:comment_created":
                pr_url = data["data"]["pullrequest"]["links"]["html"]["href"]
                log_context["api_url"] = pr_url
                log_context["event"] = "comment"
                comment_body = data["data"]["comment"]["content"]["raw"]
                with get_logger().contextualize(**log_context):
                    if get_identity_provider().verify_eligibility("bitbucket", sender_id, pr_url) is not Eligibility.NOT_ELIGIBLE:
                        await agent.handle_request(pr_url, comment_body)
        except Exception as e:
            get_logger().error(f"Failed to handle webhook: {e}")

    background_tasks.add_task(inner)
    return "OK"
@router.post("/installed")
async def handle_installed_webhooks(request: Request, response: Response):
    """Handle the ``/installed`` callback and persist the installer's credentials.

    Reads ``sharedSecret``, ``clientKey`` and ``principal.username`` from the
    JSON payload and stores the first two (JSON-encoded) in ``secret_provider``
    under the username.

    Returns:
        ``None`` on success; a 500 ``JSONResponse`` carrying an error message
        when any step of the registration raises.
    """
    try:
        get_logger().info("handle_installed_webhooks")
        get_logger().info(request.headers)
        data = await request.json()
        # Keep the payload in the logs for diagnostics, but never write the
        # shared secret itself: it is the key later used to validate JWTs.
        loggable = {
            key: ("<redacted>" if key == "sharedSecret" else value)
            for key, value in data.items()
        }
        get_logger().info(loggable)
        shared_secret = data["sharedSecret"]
        client_key = data["clientKey"]
        username = data["principal"]["username"]
        secrets = {
            "shared_secret": shared_secret,
            "client_key": client_key
        }
        secret_provider.store_secret(username, json.dumps(secrets))
    except Exception as e:
        get_logger().error(f"Failed to register user: {e}")
        return JSONResponse({"error": "Unable to register user"}, status_code=500)
def handle_request(
    background_tasks: BackgroundTasks, url: str, body: str, log_context: dict
):
    """Schedule a PR-agent command to run as a background task.

    Args:
        background_tasks: Task collection the deferred coroutine is added to.
        url: URL of the pull request the command applies to.
        body: The command text; also recorded as the logged ``action``.
        log_context: Mutable logging context; ``action`` and ``api_url`` are
            written into it before the task is scheduled.
    """
    log_context.update(action=body, api_url=url)

    async def _run_agent():
        # Errors are logged rather than raised: by the time this runs the
        # caller has already moved on.
        try:
            with get_logger().contextualize(**log_context):
                await PRAgent().handle_request(url, body)
        except Exception as err:
            get_logger().error(f"Failed to handle webhook: {err}")

    background_tasks.add_task(_run_agent)
ignore_pr_users and sender: if any(re.search(regex, sender) for regex in ignore_pr_users): get_logger().info(f"Ignoring PR from user '{sender}' due to 'config.ignore_pr_authors' setting") return False # To ignore PRs with specific titles if title: ignore_pr_title_re = get_settings().get("CONFIG.IGNORE_PR_TITLE", []) if not isinstance(ignore_pr_title_re, list): ignore_pr_title_re = [ignore_pr_title_re] if ignore_pr_title_re and any(re.search(regex, title) for regex in ignore_pr_title_re): get_logger().info(f"Ignoring PR with title '{title}' due to config.ignore_pr_title setting") return False ignore_pr_source_branches = get_settings().get("CONFIG.IGNORE_PR_SOURCE_BRANCHES", []) ignore_pr_target_branches = get_settings().get("CONFIG.IGNORE_PR_TARGET_BRANCHES", []) if (ignore_pr_source_branches or ignore_pr_target_branches): if any(re.search(regex, source_branch) for regex in ignore_pr_source_branches): get_logger().info( f"Ignoring PR with source branch '{source_branch}' due to config.ignore_pr_source_branches settings") return False if any(re.search(regex, target_branch) for regex in ignore_pr_target_branches): get_logger().info( f"Ignoring PR with target branch '{target_branch}' due to config.ignore_pr_target_branches settings") return False # Allow_only_specific_folders allowed_folders = get_settings().config.get("allow_only_specific_folders", []) if allowed_folders and pr_id and project_key and repo_slug: from pr_agent.git_providers.bitbucket_server_provider import BitbucketServerProvider bitbucket_server_url = get_settings().get("BITBUCKET_SERVER.URL", "") pr_url = f"{bitbucket_server_url}/projects/{project_key}/repos/{repo_slug}/pull-requests/{pr_id}" provider = BitbucketServerProvider(pr_url=pr_url) changed_files = provider.get_files() if changed_files: # Check if ALL files are outside allowed folders all_files_outside = True for file_path in changed_files: if any(file_path.startswith(folder) for folder in allowed_folders): all_files_outside = False break if 
@router.post("/webhook")
async def handle_webhook(background_tasks: BackgroundTasks, request: Request):
    """Entry point for Bitbucket Server webhook deliveries.

    Flow:
      1. Parse and log the JSON payload.
      2. When ``BITBUCKET_SERVER.WEBHOOK_SECRET`` is set, answer the
         ``{"test": true}`` connection probe and otherwise verify the
         ``x-hub-signature`` header against the raw body.
      3. Build the PR URL from the payload and collect the commands to run:
         configured auto commands for ``pr:opened`` / ``repo:refs_changed``,
         or the comment text for ``pr:comment:added``.
      4. Schedule the commands as a background task and reply immediately.

    Returns:
        A ``JSONResponse`` with a ``{"message": ...}`` body: 200 when the
        event was accepted or deliberately ignored, 400 for an unsupported
        event or a payload that carries no pull request.
    """

    def _reply(message: str, status_code: int = status.HTTP_200_OK) -> JSONResponse:
        # JSONResponse serialises `content` itself, so it must be given the
        # object and not an already-dumped JSON string.
        return JSONResponse(status_code=status_code, content=jsonable_encoder({"message": message}))

    log_context = {"server_type": "bitbucket_server"}
    data = await request.json()
    get_logger().info(json.dumps(data))

    webhook_secret = get_settings().get("BITBUCKET_SERVER.WEBHOOK_SECRET", None)
    if webhook_secret:
        body_bytes = await request.body()
        if body_bytes.decode('utf-8') == '{"test": true}':
            return _reply("connection test successful")
        signature_header = request.headers.get("x-hub-signature", None)
        verify_signature(body_bytes, webhook_secret, signature_header)

    # Every supported event references a pull request. A payload without one
    # (e.g. a push that is not attached to a PR) is reported as unsupported
    # instead of failing with an unhandled KeyError.
    try:
        event_key = data["eventKey"]
        pull_request = data["pullRequest"]
        pr_id = pull_request["id"]
        repository_name = pull_request["toRef"]["repository"]["slug"]
        project_name = pull_request["toRef"]["repository"]["project"]["key"]
    except (KeyError, TypeError):
        return _reply("Unsupported event", status.HTTP_400_BAD_REQUEST)

    bitbucket_server = get_settings().get("BITBUCKET_SERVER.URL")
    pr_url = f"{bitbucket_server}/projects/{project_name}/repos/{repository_name}/pull-requests/{pr_id}"

    log_context["api_url"] = pr_url
    log_context["event"] = "pull_request"

    commands_to_run = []

    # -1 marks a push that is not assigned to a PR.
    is_pr_push = event_key == "repo:refs_changed" and pr_id != -1
    if event_key == "pr:opened" or is_pr_push:
        # Auto commands for PR creation / update.
        apply_repo_settings(pr_url)
        if not should_process_pr_logic(data):
            get_logger().info("PR ignored due to config settings", **log_context)
            return _reply("PR ignored by config")
        if get_settings().config.disable_auto_feedback:
            # Auto commands for PR, and auto feedback is disabled.
            get_logger().info(f"Auto feedback is disabled, skipping auto commands for PR {pr_url}", **log_context)
            return _reply("PR ignored due to auto feedback not enabled")
        get_settings().set("config.is_auto_command", True)
        if event_key == "pr:opened":
            commands_to_run.extend(_get_commands_list_from_settings('BITBUCKET_SERVER.PR_COMMANDS'))
        else:
            # Only "repo:refs_changed" can reach this branch.
            if not get_settings().get("BITBUCKET_SERVER.HANDLE_PUSH_TRIGGER"):
                get_logger().info(f"Push trigger is disabled, skipping push commands for PR {pr_url}", **log_context)
                return _reply("PR ignored due to push trigger not enabled")
            get_settings().set("config.is_new_pr", False)
            commands_to_run.extend(_get_commands_list_from_settings('BITBUCKET_SERVER.PUSH_COMMANDS'))
    elif event_key == "pr:comment:added":
        commands_to_run.append(data["comment"]["text"])
    else:
        return _reply("Unsupported event", status.HTTP_400_BAD_REQUEST)

    async def inner():
        try:
            await _run_commands_sequentially(commands_to_run, pr_url, log_context)
        except Exception as e:
            get_logger().error(f"Failed to handle webhook: {e}")

    background_tasks.add_task(inner)

    return _reply("success")
def _get_commands_list_from_settings(setting_key:str ) -> list:
    """Return the list of commands configured under ``setting_key``.

    Args:
        setting_key: Dotted settings key, e.g. ``BITBUCKET_SERVER.PR_COMMANDS``.

    Returns:
        The configured commands. A string value is parsed with ``_to_list``
        (presumably the form the setting takes when it is not supplied as a
        real list; verify against the deployment's configuration source).
        An empty list is returned when the setting is missing, empty, or
        cannot be parsed, so callers can always ``extend`` with the result.
    """
    try:
        commands = get_settings().get(setting_key, [])
        if isinstance(commands, str):
            # Extending a list with a raw string would add it character by
            # character; parse it into a list of commands first.
            commands = _to_list(commands)
    except ValueError as e:
        get_logger().error(f"Failed to get commands list from settings {setting_key}: {e}")
        return []
    return commands or []
@router.post("/api/v1/gerrit/{action}")
async def handle_gerrit_request(action: Action, item: Item):
    """Run a PR-agent command for a Gerrit change.

    The command handed to the agent is ``/`` followed by the stripped
    ``item.msg``; ``action`` is only used to validate that an ``ask`` request
    actually carries a message.

    Args:
        action: The action named in the request path.
        item: Request body with ``project``, ``refspec`` and ``msg``.

    Raises:
        HTTPException: 400 when ``action`` is ``ask`` and ``msg`` is empty.
    """
    get_logger().debug("Received a Gerrit request")
    # Each request works on its own copy of the global settings.
    context["settings"] = copy.deepcopy(global_settings)

    if action == Action.ask and not item.msg:
        # Must be raised, not returned: a returned HTTPException is treated as
        # ordinary response data and never produces the 400 status.
        raise HTTPException(
            status_code=400, detail="msg is required for ask command"
        )
    await PRAgent().handle_request(
        f"{item.project}:{item.refspec}",
        f"/{item.msg.strip()}"
    )
async def get_body(request: Request):
    """Return the parsed JSON body of a Gitea webhook, verifying its signature.

    Raises:
        HTTPException: 400 if the body is not valid JSON or, when a webhook
            secret is configured, the ``x-gitea-signature`` header is missing;
            401 if signature verification fails.
    """
    try:
        payload = await request.json()
    except Exception as e:
        get_logger().error("Error parsing request body", artifact={'error': e})
        raise HTTPException(status_code=400, detail="Error parsing request body") from e

    secret = getattr(get_settings().gitea, 'webhook_secret', None)
    if not secret:
        # No secret configured: the payload is accepted unverified.
        return payload

    raw_body = await request.body()
    signature = request.headers.get('x-gitea-signature', None)
    if not signature:
        get_logger().error("Missing signature header")
        raise HTTPException(status_code=400, detail="Missing signature header")

    try:
        # The header value is prefixed with "sha256=" before being handed to
        # the shared verifier.
        verify_signature(raw_body, secret, f"sha256={signature}")
    except Exception as ex:
        get_logger().error(f"Invalid signature: {ex}")
        raise HTTPException(status_code=401, detail="Invalid signature")

    return payload
async def handle_comment_event(body: Dict[str, Any], event: str, action: str, agent: PRAgent):
    """Forward a slash-command comment to the agent.

    Nothing happens unless the payload has a comment whose body starts with
    ``/`` and a ``pull_request.url`` to run the command against.
    """
    comment_payload = body.get("comment", {})
    command_text = comment_payload.get("body", "") if comment_payload else ""
    is_command = bool(command_text) and command_text.startswith("/")

    # The PR URL is only looked up once the comment is known to be a command.
    target_url = body.get("pull_request", {}).get("url") if is_command else None
    if target_url:
        await agent.handle_request(target_url, command_text)
def should_process_pr_logic(body) -> bool:
    """Decide whether a Gitea pull-request payload should be processed.

    Applies the ``CONFIG.IGNORE_*`` settings in order: repositories, authors,
    titles, labels, then source and target branches. Repository, author,
    title and branch settings are treated as regular expressions and matched
    with ``re.search``; labels are compared by exact name.

    Returns:
        ``False`` as soon as one ignore rule matches, ``True`` otherwise. Any
        exception raised during the checks is logged and the PR is processed.
    """
    try:
        pr = body.get("pull_request", {})
        pr_title = pr.get("title", "")
        label_entries = pr.get("labels", [])
        head_ref = pr.get("head", {}).get("ref", "")
        base_ref = pr.get("base", {}).get("ref", "")
        author = body.get("sender", {}).get("login")
        repo_name = body.get("repository", {}).get("full_name", "")

        # Ignored repositories.
        repo_patterns = get_settings().get("CONFIG.IGNORE_REPOSITORIES", [])
        if repo_patterns and repo_name:
            for pattern in repo_patterns:
                if re.search(pattern, repo_name):
                    get_logger().info(f"Ignoring PR from repository '{repo_name}' due to 'config.ignore_repositories' setting")
                    return False

        # Ignored authors.
        author_patterns = get_settings().get("CONFIG.IGNORE_PR_AUTHORS", [])
        if author_patterns and author:
            for pattern in author_patterns:
                if re.search(pattern, author):
                    get_logger().info(f"Ignoring PR from user '{author}' due to 'config.ignore_pr_authors' setting")
                    return False

        # Ignored titles; a single pattern may be configured instead of a list.
        if pr_title:
            title_patterns = get_settings().get("CONFIG.IGNORE_PR_TITLE", [])
            if not isinstance(title_patterns, list):
                title_patterns = [title_patterns]
            if title_patterns:
                for pattern in title_patterns:
                    if re.search(pattern, pr_title):
                        get_logger().info(f"Ignoring PR with title '{pr_title}' due to config.ignore_pr_title setting")
                        return False

        # Ignored labels.
        ignored_labels = get_settings().get("CONFIG.IGNORE_PR_LABELS", [])
        if label_entries and ignored_labels:
            label_names = [entry['name'] for entry in label_entries]
            for name in label_names:
                if name in ignored_labels:
                    joined_names = ", ".join(label_names)
                    get_logger().info(f"Ignoring PR with labels '{joined_names}' due to config.ignore_pr_labels settings")
                    return False

        # Ignored source / target branches.
        source_patterns = get_settings().get("CONFIG.IGNORE_PR_SOURCE_BRANCHES", [])
        target_patterns = get_settings().get("CONFIG.IGNORE_PR_TARGET_BRANCHES", [])
        if pr and (source_patterns or target_patterns):
            for pattern in source_patterns:
                if re.search(pattern, head_ref):
                    get_logger().info(
                        f"Ignoring PR with source branch '{head_ref}' due to config.ignore_pr_source_branches settings")
                    return False
            for pattern in target_patterns:
                if re.search(pattern, base_ref):
                    get_logger().info(
                        f"Ignoring PR with target branch '{base_ref}' due to config.ignore_pr_target_branches settings")
                    return False
    except Exception as e:
        # A failure in the filter must not block processing.
        get_logger().error(f"Failed 'should_process_pr_logic': {e}")
    return True
async def run_action():
    """Run PR-Agent inside a GitHub Action for the triggering event.

    Steps:
      1. Read the event name/path and tokens from the environment; return
         early (after printing a message) when a required one is missing.
      2. Copy the credentials into the settings and load the event payload.
      3. Apply repository settings and, for a non ``en-us`` response
         language, append a language instruction to the tools'
         ``extra_instructions``.
      4. For ``pull_request`` / ``pull_request_target`` events whose action is
         in ``GITHUB_ACTION_CONFIG.PR_ACTIONS``, run describe, review and
         improve (each unless explicitly disabled).
      5. For ``issue_comment`` / ``pull_request_review_comment`` events, hand
         the comment text to ``PRAgent`` as a command.
    """
    # Get environment variables
    GITHUB_EVENT_NAME = os.environ.get('GITHUB_EVENT_NAME')
    GITHUB_EVENT_PATH = os.environ.get('GITHUB_EVENT_PATH')
    OPENAI_KEY = os.environ.get('OPENAI_KEY') or os.environ.get('OPENAI.KEY')
    OPENAI_ORG = os.environ.get('OPENAI_ORG') or os.environ.get('OPENAI.ORG')
    GITHUB_TOKEN = os.environ.get('GITHUB_TOKEN')
    # get_settings().set("CONFIG.PUBLISH_OUTPUT_PROGRESS", False)

    # Check if required environment variables are set
    if not GITHUB_EVENT_NAME:
        print("GITHUB_EVENT_NAME not set")
        return
    if not GITHUB_EVENT_PATH:
        print("GITHUB_EVENT_PATH not set")
        return
    if not GITHUB_TOKEN:
        print("GITHUB_TOKEN not set")
        return

    # Set the environment variables in the settings
    if OPENAI_KEY:
        get_settings().set("OPENAI.KEY", OPENAI_KEY)
    else:
        # Might not be set if the user is using models not from OpenAI
        print("OPENAI_KEY not set")
    if OPENAI_ORG:
        get_settings().set("OPENAI.ORG", OPENAI_ORG)
    get_settings().set("GITHUB.USER_TOKEN", GITHUB_TOKEN)
    get_settings().set("GITHUB.DEPLOYMENT_TYPE", "user")
    enable_output = get_setting_or_env("GITHUB_ACTION_CONFIG.ENABLE_OUTPUT", True)
    get_settings().set("GITHUB_ACTION_CONFIG.ENABLE_OUTPUT", enable_output)

    # Load the event payload
    try:
        with open(GITHUB_EVENT_PATH, 'r') as f:
            event_payload = json.load(f)
    except json.decoder.JSONDecodeError as e:
        print(f"Failed to parse JSON: {e}")
        return

    try:
        get_logger().info("Applying repo settings")
        pr_url = event_payload.get("pull_request", {}).get("html_url")
        if pr_url:
            apply_repo_settings(pr_url)
            get_logger().info(f"enable_custom_labels: {get_settings().config.enable_custom_labels}")
    except Exception as e:
        get_logger().info(f"github action: failed to apply repo settings: {e}")

    # Append the response language in the extra instructions
    try:
        response_language = get_settings().config.get('response_language', 'en-us')
        if response_language.lower() != 'en-us':
            get_logger().info(f'User has set the response language to: {response_language}')

            lang_instruction_text = f"Your response MUST be written in the language corresponding to locale code: '{response_language}'. This is crucial."
            separator_text = "\n======\n\nIn addition, "
            tool_sections = ('pr_description', 'pr_code_suggestions', 'pr_reviewer')

            for key in get_settings():
                if key.lower() not in tool_sections:
                    continue
                setting = get_settings().get(key)
                # Select sections by capability. The previous check compared
                # str(type(setting)) with an empty string, which never
                # matches, so the instruction was never appended.
                if not hasattr(setting, 'extra_instructions'):
                    continue
                extra_instructions = setting.extra_instructions
                if lang_instruction_text not in str(extra_instructions):
                    updated_instructions = (
                        str(extra_instructions) + separator_text + lang_instruction_text
                        if extra_instructions else lang_instruction_text
                    )
                    setting.extra_instructions = updated_instructions
    except Exception as e:
        get_logger().info(f"github action: failed to apply language-specific instructions: {e}")

    # Handle pull request opened event
    if GITHUB_EVENT_NAME == "pull_request" or GITHUB_EVENT_NAME == "pull_request_target":
        action = event_payload.get("action")

        # Retrieve the list of actions from the configuration
        pr_actions = get_settings().get("GITHUB_ACTION_CONFIG.PR_ACTIONS", ["opened", "reopened", "ready_for_review", "review_requested"])

        if action in pr_actions:
            pr_url = event_payload.get("pull_request", {}).get("url")
            if pr_url:
                # legacy - supporting both GITHUB_ACTION and GITHUB_ACTION_CONFIG
                auto_review = get_setting_or_env("GITHUB_ACTION.AUTO_REVIEW", None)
                if auto_review is None:
                    auto_review = get_setting_or_env("GITHUB_ACTION_CONFIG.AUTO_REVIEW", None)
                auto_describe = get_setting_or_env("GITHUB_ACTION.AUTO_DESCRIBE", None)
                if auto_describe is None:
                    auto_describe = get_setting_or_env("GITHUB_ACTION_CONFIG.AUTO_DESCRIBE", None)
                auto_improve = get_setting_or_env("GITHUB_ACTION.AUTO_IMPROVE", None)
                if auto_improve is None:
                    auto_improve = get_setting_or_env("GITHUB_ACTION_CONFIG.AUTO_IMPROVE", None)

                # Set the configuration for auto actions
                get_settings().config.is_auto_command = True  # Set the flag to indicate that the command is auto
                get_settings().pr_description.final_update_message = False  # No final update message when auto_describe is enabled
                get_logger().info(f"Running auto actions: auto_describe={auto_describe}, auto_review={auto_review}, auto_improve={auto_improve}")

                # invoke by default all three tools
                if auto_describe is None or is_true(auto_describe):
                    await PRDescription(pr_url).run()
                if auto_review is None or is_true(auto_review):
                    await PRReviewer(pr_url).run()
                if auto_improve is None or is_true(auto_improve):
                    await PRCodeSuggestions(pr_url).run()
        else:
            get_logger().info(f"Skipping action: {action}")

    # Handle issue comment event
    elif GITHUB_EVENT_NAME == "issue_comment" or GITHUB_EVENT_NAME == "pull_request_review_comment":
        action = event_payload.get("action")
        if action in ["created", "edited"]:
            comment_body = event_payload.get("comment", {}).get("body")
            try:
                if GITHUB_EVENT_NAME == "pull_request_review_comment":
                    # A payload without a comment body is simply skipped below
                    # instead of raising a TypeError on the membership test.
                    if comment_body and '/ask' in comment_body:
                        comment_body = handle_line_comments(event_payload, comment_body)
            except Exception as e:
                get_logger().error(f"Failed to handle line comments: {e}")
                return
            if comment_body:
                is_pr = False
                disable_eyes = False
                # check if issue is pull request
                if event_payload.get("issue", {}).get("pull_request"):
                    url = event_payload.get("issue", {}).get("pull_request", {}).get("url")
                    is_pr = True
                elif event_payload.get("comment", {}).get("pull_request_url"):  # for 'pull_request_review_comment
                    url = event_payload.get("comment", {}).get("pull_request_url")
                    is_pr = True
                    disable_eyes = True
                else:
                    url = event_payload.get("issue", {}).get("url")

                if url:
                    body = comment_body.strip().lower()
                    comment_id = event_payload.get("comment", {}).get("id")
                    provider = get_git_provider()(pr_url=url)
                    if is_pr:
                        await PRAgent().handle_request(
                            url, body, notify=lambda: provider.add_eyes_reaction(
                                comment_id, disable_eyes=disable_eyes
                            )
                        )
                    else:
                        await PRAgent().handle_request(url, body)
async def get_body(request):
    """Return the parsed JSON body of a GitHub webhook request.

    When ``webhook_secret`` is set on the ``github`` settings section, the raw
    body is checked against the ``x-hub-signature-256`` header via
    ``verify_signature`` before the payload is returned.

    Raises:
        HTTPException: 400 if the body cannot be parsed as JSON.
    """
    try:
        payload = await request.json()
    except Exception as e:
        get_logger().error("Error parsing request body", artifact={'error': e})
        raise HTTPException(status_code=400, detail="Error parsing request body") from e

    secret = getattr(get_settings().github, 'webhook_secret', None)
    if not secret:
        # No secret configured: the payload is accepted unverified.
        return payload

    raw_body = await request.body()
    verify_signature(raw_body, secret, request.headers.get('x-hub-signature-256', None))
    return payload
ttl=get_settings().github_app.push_trigger_pending_tasks_ttl) async def handle_comments_on_pr(body: Dict[str, Any], event: str, sender: str, sender_id: str, action: str, log_context: Dict[str, Any], agent: PRAgent): if "comment" not in body: return {} comment_body = body.get("comment", {}).get("body") if comment_body and isinstance(comment_body, str) and not comment_body.lstrip().startswith("/"): if '/ask' in comment_body and comment_body.strip().startswith('> ![image]'): comment_body_split = comment_body.split('/ask') comment_body = '/ask' + comment_body_split[1] +' \n' +comment_body_split[0].strip().lstrip('>') get_logger().info(f"Reformatting comment_body so command is at the beginning: {comment_body}") else: get_logger().info("Ignoring comment not starting with /") return {} disable_eyes = False if "issue" in body and "pull_request" in body["issue"] and "url" in body["issue"]["pull_request"]: api_url = body["issue"]["pull_request"]["url"] elif "comment" in body and "pull_request_url" in body["comment"]: api_url = body["comment"]["pull_request_url"] try: if ('/ask' in comment_body and 'subject_type' in body["comment"] and body["comment"]["subject_type"] == "line"): # comment on a code line in the "files changed" tab comment_body = handle_line_comments(body, comment_body) disable_eyes = True except Exception as e: get_logger().error("Failed to get log context", artifact={'error': e}) else: return {} log_context["api_url"] = api_url comment_id = body.get("comment", {}).get("id") provider = get_git_provider_with_context(pr_url=api_url) with get_logger().contextualize(**log_context): if get_identity_provider().verify_eligibility("github", sender_id, api_url) is not Eligibility.NOT_ELIGIBLE: get_logger().info(f"Processing comment on PR {api_url=}, comment_body={comment_body}") await agent.handle_request(api_url, comment_body, notify=lambda: provider.add_eyes_reaction(comment_id, disable_eyes=disable_eyes)) else: get_logger().info(f"User {sender=} is not eligible to 
async def handle_new_pr_opened(body: Dict[str, Any],
                               event: str,
                               sender: str,
                               sender_id: str,
                               action: str,
                               log_context: Dict[str, Any],
                               agent: PRAgent):
    """
    Run the configured "pr_commands" for a pull_request event.

    Args:
        body: Parsed webhook payload.
        event: GitHub event name (not used here).
        sender: Login of the acting user (used in log messages).
        sender_id: Id of the acting user, passed to the eligibility check.
        action: Webhook action; only actions listed in
            `github_app.handle_pr_actions` trigger the commands.
        log_context: Logging context, updated by _check_pull_request_event.
        agent: Agent that executes the commands.

    Returns:
        An empty dict when the event fails validation, otherwise None.
    """
    # The payload is validated before anything is read from it.
    pull_request, api_url = _check_pull_request_event(action, body, log_context)
    if not (pull_request and api_url):
        get_logger().info(f"Invalid PR event: {action=} {api_url=}")
        return {}
    if action in get_settings().github_app.handle_pr_actions:  # ['opened', 'reopened', 'ready_for_review']
        # Load the repository-level settings before the eligibility check.
        apply_repo_settings(api_url)
        if get_identity_provider().verify_eligibility("github", sender_id, api_url) is not Eligibility.NOT_ELIGIBLE:
            await _perform_auto_commands_github("pr_commands", agent, body, api_url, log_context)
        else:
            get_logger().info(f"User {sender=} is not eligible to process PR {api_url=}")
# Any more events will be discarded, because they will all trigger the exact same processing on the PR. # We let the second event wait instead of discarding it because while the first event was being processed, # more commits may have been pushed that led to the subsequent events, # so we keep just one waiting as a delegate to trigger the processing for the new commits when done waiting. current_active_tasks = _duplicate_push_triggers.setdefault(api_url, 0) max_active_tasks = 2 if get_settings().github_app.push_trigger_pending_tasks_backlog else 1 if current_active_tasks < max_active_tasks: # first task can enter, and second tasks too if backlog is enabled get_logger().info( f"Continue processing push trigger for {api_url=} because there are {current_active_tasks} active tasks" ) _duplicate_push_triggers[api_url] += 1 else: get_logger().info( f"Skipping push trigger for {api_url=} because another event already triggered the same processing" ) return {} async with _pending_task_duplicate_push_conditions[api_url]: if current_active_tasks == 1: # second task waits get_logger().info( f"Waiting to process push trigger for {api_url=} because the first task is still in progress" ) await _pending_task_duplicate_push_conditions[api_url].wait() get_logger().info(f"Finished waiting to process push trigger for {api_url=} - continue with flow") try: if get_identity_provider().verify_eligibility("github", sender_id, api_url) is not Eligibility.NOT_ELIGIBLE: get_logger().info(f"Performing incremental review for {api_url=} because of {event=} and {action=}") await _perform_auto_commands_github("push_commands", agent, body, api_url, log_context) finally: # release the waiting task block async with _pending_task_duplicate_push_conditions[api_url]: _pending_task_duplicate_push_conditions[api_url].notify(1) _duplicate_push_triggers[api_url] -= 1 def handle_closed_pr(body, event, action, log_context): pull_request = body.get("pull_request", {}) is_merged = pull_request.get("merged", 
def handle_closed_pr(body, event, action, log_context):
    """Log PR statistics (as an analytics record) for a pull request that was merged."""
    pull_request = body.get("pull_request", {})
    if not pull_request.get("merged", False):
        return

    api_url = pull_request.get("url", "")
    provider = get_git_provider()(pr_url=api_url)
    pr_statistics = provider.calc_pr_statistics(pull_request)
    log_context["api_url"] = api_url
    get_logger().info("PR-Agent statistics for closed PR", analytics=True, pr_statistics=pr_statistics,
                      **log_context)


def get_log_context(body, event, action, build_number):
    """
    Build the logging context for a webhook payload.

    Returns:
        A tuple (log_context, sender, sender_id, sender_type). If anything fails
        while collecting the fields, log_context is an empty dict and the sender
        values keep whatever was read before the failure.
    """
    def _nested(section, key, *default):
        # Two-level lookup: body[section][key], tolerating a missing section.
        return body.get(section, {}).get(key, *default)

    sender = sender_id = sender_type = ""
    try:
        sender = _nested("sender", "login")
        sender_id = _nested("sender", "id")
        sender_type = _nested("sender", "type")
        repo = _nested("repository", "full_name", "")
        git_org = _nested("organization", "login", "")
        installation_id = _nested("installation", "id", "")
        app_name = get_settings().get("CONFIG.APP_NAME", "Unknown")
        log_context = dict(
            action=action,
            event=event,
            sender=sender,
            server_type="github_app",
            request_id=uuid.uuid4().hex,
            build_number=build_number,
            app_name=app_name,
            repo=repo,
            git_org=git_org,
            installation_id=installation_id,
        )
    except Exception as e:
        get_logger().error(f"Failed to get log context", artifact={'error': e})
        log_context = {}
    return log_context, sender, sender_id, sender_type


def is_bot_user(sender, sender_type):
    """
    Tell whether the event should be ignored because its sender is a bot.

    True only when GITHUB_APP.IGNORE_BOT_PR is enabled, the sender type is "Bot"
    and the sender name does not contain 'pr-agent'. Any failure is logged and
    treated as "not a bot".
    """
    try:
        ignore_bots = get_settings().get("GITHUB_APP.IGNORE_BOT_PR", False)
        if ignore_bots and sender_type == "Bot" and 'pr-agent' not in sender:
            get_logger().info(f"Ignoring PR from '{sender=}' because it is a bot")
            return True
    except Exception as e:
        get_logger().error(f"Failed 'is_bot_user' logic: {e}")
    return False
def _as_list(value) -> list:
    """Normalize a config value (missing, scalar or sequence) into a plain list."""
    if not value:
        return []
    if isinstance(value, (list, tuple, set)):
        return list(value)
    return [value]


def should_process_pr_logic(body) -> bool:
    """
    Decide whether a PR event passes the configured ignore-filters.

    Checks, in order: CONFIG.IGNORE_REPOSITORIES, CONFIG.IGNORE_PR_AUTHORS,
    CONFIG.IGNORE_PR_TITLE (regular expressions, `re.search`),
    CONFIG.IGNORE_PR_LABELS (exact label names) and
    CONFIG.IGNORE_PR_SOURCE_BRANCHES / CONFIG.IGNORE_PR_TARGET_BRANCHES
    (regular expressions).

    Every setting is normalized to a list first, so a single string is treated
    as one pattern instead of being iterated character by character.

    Returns:
        False when a filter matches; True otherwise, including when the
        evaluation itself fails (the failure is logged).
    """
    try:
        pull_request = body.get("pull_request", {})
        title = pull_request.get("title", "")
        pr_labels = pull_request.get("labels", [])
        # `or ""` keeps a null ref from raising inside re.search.
        source_branch = pull_request.get("head", {}).get("ref", "") or ""
        target_branch = pull_request.get("base", {}).get("ref", "") or ""
        sender = body.get("sender", {}).get("login")
        repo_full_name = body.get("repository", {}).get("full_name", "")
        settings = get_settings()

        # logic to ignore PRs from specific repositories
        ignore_repos = _as_list(settings.get("CONFIG.IGNORE_REPOSITORIES", []))
        if ignore_repos and repo_full_name:
            if any(re.search(regex, repo_full_name) for regex in ignore_repos):
                get_logger().info(f"Ignoring PR from repository '{repo_full_name}' due to 'config.ignore_repositories' setting")
                return False

        # logic to ignore PRs from specific users
        ignore_pr_users = _as_list(settings.get("CONFIG.IGNORE_PR_AUTHORS", []))
        if ignore_pr_users and sender:
            if any(re.search(regex, sender) for regex in ignore_pr_users):
                get_logger().info(f"Ignoring PR from user '{sender}' due to 'config.ignore_pr_authors' setting")
                return False

        # logic to ignore PRs with specific titles
        if title:
            ignore_pr_title_re = _as_list(settings.get("CONFIG.IGNORE_PR_TITLE", []))
            if any(re.search(regex, title) for regex in ignore_pr_title_re):
                get_logger().info(f"Ignoring PR with title '{title}' due to config.ignore_pr_title setting")
                return False

        # logic to ignore PRs with specific labels
        ignore_pr_labels = _as_list(settings.get("CONFIG.IGNORE_PR_LABELS", []))
        if pr_labels and ignore_pr_labels:
            labels = [label['name'] for label in pr_labels]
            if any(label in ignore_pr_labels for label in labels):
                labels_str = ", ".join(labels)
                get_logger().info(f"Ignoring PR with labels '{labels_str}' due to config.ignore_pr_labels settings")
                return False

        # logic to ignore PRs with specific source or target branches
        ignore_pr_source_branches = _as_list(settings.get("CONFIG.IGNORE_PR_SOURCE_BRANCHES", []))
        ignore_pr_target_branches = _as_list(settings.get("CONFIG.IGNORE_PR_TARGET_BRANCHES", []))
        if pull_request and (ignore_pr_source_branches or ignore_pr_target_branches):
            if any(re.search(regex, source_branch) for regex in ignore_pr_source_branches):
                get_logger().info(
                    f"Ignoring PR with source branch '{source_branch}' due to config.ignore_pr_source_branches settings")
                return False
            if any(re.search(regex, target_branch) for regex in ignore_pr_target_branches):
                get_logger().info(
                    f"Ignoring PR with target branch '{target_branch}' due to config.ignore_pr_target_branches settings")
                return False
    except Exception as e:
        # Deliberately permissive: a broken filter must not block processing.
        get_logger().error(f"Failed 'should_process_pr_logic': {e}")
    return True
async def handle_request(body: Dict[str, Any], event: str):
    """
    Dispatch a parsed GitHub webhook payload to the matching handler.

    Args:
        body: The request body.
        event: The GitHub event type (e.g. "pull_request", "issue_comment", etc.).

    Returns:
        Always an empty dict.
    """
    action = body.get("action")  # "created", "opened", "reopened", "ready_for_review", "review_requested", "synchronize"
    get_logger().debug(f"Handling request with event: {event}, action: {action}")
    if not action:
        get_logger().debug(f"No action found in request body, exiting handle_request")
        return {}

    agent = PRAgent()
    log_context, sender, sender_id, sender_type = get_log_context(body, event, action, build_number)
    is_check_run = 'check_run' in body

    # Filters: bot senders, and (except for created comments) the PR ignore-rules.
    # Payloads carrying a check_run bypass both.
    if is_bot_user(sender, sender_type) and not is_check_run:
        get_logger().debug(f"Request ignored: bot user detected")
        return {}
    if action != 'created' and not is_check_run and not should_process_pr_logic(body):
        get_logger().debug(f"Request ignored: PR logic filtering")
        return {}

    is_pull_request = event == 'pull_request'
    if is_check_run:
        # failed checks: nothing is done for them here
        pass
    elif action == 'created':
        # comments on PRs
        get_logger().debug(f'Request body', artifact=body, event=event)
        await handle_comments_on_pr(body, event, sender, sender_id, action, log_context, agent)
    elif is_pull_request and action not in ('synchronize', 'closed'):
        # new PRs
        get_logger().debug(f'Request body', artifact=body, event=event)
        await handle_new_pr_opened(body, event, sender, sender_id, action, log_context, agent)
    elif event == "issue_comment" and 'edited' in action:
        # checkbox clicked: no handling here
        pass
    elif is_pull_request and action == 'synchronize':
        # "push trigger" for new commits
        await handle_push_trigger_for_new_commits(body, event, sender, sender_id, action, log_context, agent)
    elif is_pull_request and action == 'closed':
        if get_settings().get("CONFIG.ANALYTICS_FOLDER", ""):
            handle_closed_pr(body, event, action, log_context)
    else:
        get_logger().info(f"event {event=} action {action=} does not require any handling")
    return {}
def handle_line_comments(body: Dict, comment_body: str) -> str:
    """
    Rewrite an "/ask" comment made on a diff line into an "/ask_line" command.

    The diff hunk of the comment is stored in the settings under "ask_diff_hunk".

    Args:
        body: Webhook payload; `body["comment"]` supplies line, diff_hunk, path,
            side, id and (optionally) start_line.
        comment_body: Text of the comment.

    Returns:
        "" for an empty comment, the "/ask_line ..." command when the comment
        contains "/ask", otherwise the comment unchanged.

    Raises:
        KeyError: when a required field is missing from `body["comment"]`.
    """
    if not comment_body:
        return ""
    comment = body["comment"]
    end_line = comment["line"]
    # A missing or empty start_line means a single-line comment.
    start_line = comment.get("start_line") or end_line
    question = comment_body.replace('/ask', '').strip()
    diff_hunk = comment["diff_hunk"]
    get_settings().set("ask_diff_hunk", diff_hunk)
    path = comment["path"]
    side = comment["side"]
    comment_id = comment["id"]
    if '/ask' in comment_body:
        comment_body = f"/ask_line --line_start={start_line} --line_end={end_line} --side={side} --file_name={path} --comment_id={comment_id} {question}"
    return comment_body


def _check_pull_request_event(action: str, body: dict, log_context: dict) -> Tuple[Dict[str, Any], str]:
    """
    Validate a pull_request payload and extract the PR object and its API url.

    Side effect: once an url is found it is stored in `log_context["api_url"]`,
    even if the event is subsequently rejected.

    Returns:
        (pull_request, api_url) for an open, non-draft PR; ({}, "") otherwise.
        A payload without a "draft" field is treated as a draft.
    """
    invalid_result = {}, ""
    pull_request = body.get("pull_request")
    if not pull_request:
        return invalid_result
    api_url = pull_request.get("url")
    if not api_url:
        return invalid_result
    log_context["api_url"] = api_url

    is_draft = pull_request.get("draft", True)
    if is_draft or pull_request.get("state") != "open":
        return invalid_result

    just_created = pull_request.get("created_at") == pull_request.get("updated_at")
    if action in ("review_requested", "synchronize") and just_created:
        # avoid double reviews when opening a PR for the first time
        return invalid_result
    return pull_request, api_url
async def _perform_auto_commands_github(commands_conf: str, agent: PRAgent, body: dict, api_url: str,
                                        log_context: dict):
    """
    Run every command listed under `github_app.<commands_conf>` against the PR.

    Args:
        commands_conf: Name of the command list ("pr_commands" or "push_commands").
        agent: Agent that executes each command.
        body: Webhook payload, re-checked against the ignore-filters.
        api_url: API url of the pull request.
        log_context: Logging context (not used here).
    """
    apply_repo_settings(api_url)

    auto_feedback_disabled = commands_conf == "pr_commands" and get_settings().config.disable_auto_feedback
    if auto_feedback_disabled:
        # auto commands for PR, and auto feedback is disabled
        get_logger().info(f"Auto feedback is disabled, skipping auto commands for PR {api_url=}")
        return

    # The repo settings are loaded by now, so the filters see the final configuration.
    if not should_process_pr_logic(body):
        return {}

    commands = get_settings().get(f"github_app.{commands_conf}")
    if not commands:
        get_logger().info(f"New PR, but no auto commands configured")
        return

    get_settings().set("config.is_auto_command", True)
    for raw_command in commands:
        name, *args = raw_command.split(" ")
        # Arguments not consumed as settings are passed on with the command.
        other_args = update_settings_from_args(args)
        new_command = ' '.join([name] + other_args)
        get_logger().info(f"{commands_conf}. Performing auto command '{new_command}', for {api_url=}")
        await agent.handle_request(api_url, new_command)


@router.get("/")
async def root():
    """Liveness endpoint."""
    return {"status": "ok"}


if get_settings().github_app.override_deployment_type:
    # Override the deployment type to app
    get_settings().set("GITHUB.DEPLOYMENT_TYPE", "app")
# get_settings().set("CONFIG.PUBLISH_OUTPUT_PROGRESS", False)
middleware = [Middleware(RawContextMiddleware)]
app = FastAPI(middleware=middleware)
app.include_router(router)


def start():
    """Serve the app with uvicorn on 0.0.0.0, on $PORT (default 3000)."""
    port = int(os.environ.get("PORT", "3000"))
    uvicorn.run(app, host="0.0.0.0", port=port)


if __name__ == '__main__':
    start()
async def mark_notification_as_read(headers, notification, session):
    """PATCH the notification thread; any status other than 205 is logged as an error."""
    thread_url = f"https://api.github.com/notifications/threads/{notification['id']}"
    async with session.patch(thread_url, headers=headers) as mark_read_response:
        status_code = mark_read_response.status
        if status_code != 205:
            get_logger().error(
                f"Failed to mark notification as read. Status code: {mark_read_response.status}")


def now() -> str:
    """
    Get the current UTC time in ISO 8601 format.

    Returns:
        str: The current UTC time in ISO 8601 format, with a "Z" suffix instead of "+00:00".
    """
    return datetime.now(timezone.utc).isoformat().replace("+00:00", "Z")


async def async_handle_request(pr_url, rest_of_comment, comment_id, git_provider):
    """Run the agent on the comment; the eyes reaction is passed as the `notify` callback."""
    def _notify():
        return git_provider.add_eyes_reaction(comment_id)

    return await PRAgent().handle_request(pr_url, rest_of_comment, notify=_notify)


def run_handle_request(pr_url, rest_of_comment, comment_id, git_provider):
    """Synchronous wrapper that runs async_handle_request via asyncio.run."""
    coroutine = async_handle_request(pr_url, rest_of_comment, comment_id, git_provider)
    return asyncio.run(coroutine)


def process_comment_sync(pr_url, rest_of_comment, comment_id):
    """Synchronous comment handler; errors are logged instead of raised."""
    try:
        # Run the async handle_request in a separate function
        provider = get_git_provider()(pr_url=pr_url)
        run_handle_request(pr_url, rest_of_comment, comment_id, provider)
    except Exception as e:
        get_logger().error(f"Error processing comment: {e}", artifact={"traceback": traceback.format_exc()})


async def process_comment(pr_url, rest_of_comment, comment_id):
    """Asynchronous comment handler; errors are logged instead of raised."""
    try:
        provider = get_git_provider()(pr_url=pr_url)
        provider.set_pr(pr_url)
        await PRAgent().handle_request(
            pr_url,
            rest_of_comment,
            notify=lambda: provider.add_eyes_reaction(comment_id),
        )
        get_logger().info(f"Finished processing comment for PR: {pr_url}")
    except Exception as e:
        get_logger().error(f"Error processing comment: {e}", artifact={"traceback": traceback.format_exc()})
async def is_valid_notification(notification, headers, handled_ids, session, user_id):
    """
    Decide whether a notification is an unhandled mention of `user_id` on a pull request.

    The latest comment is inspected first; when it was written by `user_id`
    or does not contain the "@<user_id>" tag, the most recent comments of the PR
    are scanned instead. All HTTP requests go through the aiohttp `session`, so
    the event loop is never blocked.

    Args:
        notification: A GitHub notification object.
        headers: HTTP headers used for the API requests.
        handled_ids: Set of ids already processed; the latest comment id is added to it.
        session: aiohttp client session.
        user_id: Login whose mentions are handled.

    Returns:
        (True, handled_ids, comment, comment_body, pr_url, user_tag) when a
        mention was found, otherwise (False, handled_ids). Callers must check
        element 0 before unpacking.
    """
    max_comment_to_scan = 4
    try:
        if 'reason' not in notification or notification['reason'] != 'mention':
            return False, handled_ids
        if 'subject' not in notification or notification['subject']['type'] != 'PullRequest':
            return False, handled_ids

        pr_url = notification['subject']['url']
        latest_comment = notification['subject']['latest_comment_url']
        if not latest_comment or not isinstance(latest_comment, str):
            get_logger().debug(f"no latest_comment")
            return False, handled_ids

        user_tag = "@" + user_id
        async with session.get(latest_comment, headers=headers) as comment_response:
            if comment_response.status != 200:
                get_logger().warning(f"Failed to fetch comments for PR: {pr_url}",
                                     artifact={"status": comment_response.status})
                return False, handled_ids
            comment = await comment_response.json()

        if 'id' in comment:
            if comment['id'] in handled_ids:
                get_logger().debug(f"comment['id'] in handled_ids")
                return False, handled_ids
            handled_ids.add(comment['id'])

        check_prev_comments = False
        if 'user' in comment and 'login' in comment['user'] and comment['user']['login'] == user_id:
            get_logger().debug(f"comment['user']['login'] == user_id")
            check_prev_comments = True
        comment_body = comment.get('body', '')
        if not comment_body:
            get_logger().debug(f"no comment_body")
            check_prev_comments = True
        elif user_tag not in comment_body:
            get_logger().debug(f"user_tag not in comment_body")
            check_prev_comments = True
        else:
            get_logger().info(f"Polling, pr_url: {pr_url}", artifact={"comment": comment_body})

        if not check_prev_comments:
            return True, handled_ids, comment, comment_body, pr_url, user_tag

        # We could not find the user tag in the latest comment: scan the previous ones.
        # Only the final "/pulls/" path segment is swapped, so an owner or repository
        # whose name contains "pulls" is left untouched.
        requests_url = "/issues/".join(f"{pr_url}/comments".rsplit("/pulls/", 1))
        async with session.get(requests_url, headers=headers) as comments_response:
            comments = await comments_response.json()
        if not isinstance(comments, list):
            # Error payloads are JSON objects, not lists.
            get_logger().warning(f"Failed to fetch comments for PR: {pr_url}", artifact={"comments": comments})
            return False, handled_ids

        for comment in comments[::-1][:max_comment_to_scan]:
            if 'user' in comment and 'login' in comment['user'] and comment['user']['login'] == user_id:
                continue
            comment_body = comment.get('body', '')
            if not comment_body:
                continue
            if user_tag in comment_body:
                get_logger().info("found user tag in previous comments")
                get_logger().info(f"Polling, pr_url: {pr_url}", artifact={"comment": comment_body})
                return True, handled_ids, comment, comment_body, pr_url, user_tag

        get_logger().debug(f"user tag not found in the last {max_comment_to_scan} comments of PR: {pr_url}")
        return False, handled_ids
    except Exception as e:
        get_logger().exception(f"Error processing polling notification",
                               artifact={"notification": notification, "error": e})
        return False, handled_ids
async def polling_loop():
    """
    Polls for notifications and handles them accordingly.

    Every 5 seconds the GitHub notifications endpoint is queried; each
    notification is marked as read and, when it is a valid mention, the comment
    is processed in a separate process. At most `max_allowed_parallel_tasks`
    processes are started per polling round; the rest are dropped and logged.

    Raises:
        ValueError: when the deployment type is not 'user' or no user token is set.
    """
    max_allowed_parallel_tasks = 10
    handled_ids = set()
    since = now()
    last_modified = None
    git_provider = get_git_provider()()
    user_id = git_provider.get_user_id()
    get_settings().set("CONFIG.PUBLISH_OUTPUT_PROGRESS", False)
    get_settings().set("pr_description.publish_description_as_comment", True)

    try:
        deployment_type = get_settings().github.deployment_type
        token = get_settings().github.user_token
    except AttributeError:
        deployment_type = 'none'
        token = None

    if deployment_type != 'user':
        raise ValueError("Deployment mode must be set to 'user' to get notifications")
    if not token:
        raise ValueError("User token must be set to get notifications")

    async with aiohttp.ClientSession() as session:
        while True:
            try:
                await asyncio.sleep(5)
                headers = {
                    "Accept": "application/vnd.github.v3+json",
                    "Authorization": f"Bearer {token}"
                }
                params = {
                    "participating": "true"
                }
                # `since` is used only until the first Last-Modified header is seen;
                # afterwards conditional requests take over.
                if since:
                    params["since"] = since
                if last_modified:
                    headers["If-Modified-Since"] = last_modified

                async with session.get(NOTIFICATION_URL, headers=headers, params=params) as response:
                    if response.status == 200:
                        if 'Last-Modified' in response.headers:
                            last_modified = response.headers['Last-Modified']
                            since = None
                        notifications = await response.json()
                        if not notifications:
                            continue
                        get_logger().info(f"Received {len(notifications)} notifications")
                        task_queue = deque()
                        for notification in notifications:
                            if not notification:
                                continue
                            # mark notification as read
                            await mark_notification_as_read(headers, notification, session)

                            handled_ids.add(notification['id'])
                            output = await is_valid_notification(notification, headers, handled_ids, session, user_id)
                            if output[0]:
                                _, handled_ids, comment, comment_body, pr_url, user_tag = output
                                rest_of_comment = comment_body.split(user_tag)[1].strip()
                                comment_id = comment['id']

                                # Add to the task queue
                                get_logger().info(
                                    f"Adding comment processing to task queue for PR, {pr_url}, comment_body: {comment_body}")
                                task_queue.append((process_comment_sync, (pr_url, rest_of_comment, comment_id)))
                                get_logger().info(f"Queued comment processing for PR: {pr_url}")
                            else:
                                get_logger().debug(f"Skipping comment processing for PR")

                        for i, (func, args) in enumerate(task_queue):
                            # Check the cap BEFORE starting a process, so exactly
                            # `max_allowed_parallel_tasks` run and the logged drop count is accurate.
                            if i >= max_allowed_parallel_tasks:
                                get_logger().error(
                                    f"Dropping {len(task_queue) - max_allowed_parallel_tasks} tasks from polling session")
                                break
                            multiprocessing.Process(target=func, args=args).start()
                        task_queue.clear()
                        # Processes are intentionally not joined: move on to the next polling round.

                    elif response.status != 304:
                        get_logger().error(f"Failed to fetch notifications. Status code: {response.status}")

            except Exception as e:
                get_logger().error(f"Polling exception during processing of a notification: {e}",
                                   artifact={"traceback": traceback.format_exc()})


if __name__ == '__main__':
    asyncio.run(polling_loop())
async def handle_request(api_url: str, body: str, log_context: dict, sender_id: str, notify=None):
    """
    Run a single command on a merge request within an enriched logging context.

    Args:
        api_url: Url of the merge request.
        body: Command text; also recorded as the "action" of the log context.
        log_context: Logging context, updated in place.
        sender_id: Id of the sender (not used here).
        notify: Optional callback forwarded to the agent.
    """
    event_kind = "pull_request" if body == "/review" else "comment"
    log_context["action"] = body
    log_context["event"] = event_kind
    log_context["api_url"] = api_url
    log_context["app_name"] = get_settings().get("CONFIG.APP_NAME", "Unknown")

    agent = PRAgent()
    with get_logger().contextualize(**log_context):
        await agent.handle_request(api_url, body, notify)


async def _perform_commands_gitlab(commands_conf: str, agent: PRAgent, api_url: str,
                                   log_context: dict, data: dict):
    """
    Run every command listed under `gitlab.<commands_conf>` against the merge request.

    A failing command is logged and does not prevent the following ones from running.
    """
    apply_repo_settings(api_url)

    skip_auto_feedback = commands_conf == "pr_commands" and get_settings().config.disable_auto_feedback
    if skip_auto_feedback:
        # auto commands for PR, and auto feedback is disabled
        get_logger().info(f"Auto feedback is disabled, skipping auto commands for PR {api_url=}", **log_context)
        return

    # The configuration is already updated with the repo settings at this point.
    if not should_process_pr_logic(data):
        return

    commands = get_settings().get(f"gitlab.{commands_conf}", {})
    get_settings().set("config.is_auto_command", True)
    for command in commands:
        try:
            tokens = command.split(" ")
            # `command` is rebound to the bare command name, which is what the
            # error message below reports.
            command, args = tokens[0], tokens[1:]
            other_args = update_settings_from_args(args)
            new_command = ' '.join([command] + other_args)
            get_logger().info(f"Performing command: {new_command}")
            with get_logger().contextualize(**log_context):
                await agent.handle_request(api_url, new_command)
        except Exception as e:
            get_logger().error(f"Failed to perform command {command}: {e}")
def is_bot_user(data) -> bool:
    """
    Heuristically detect a bot sender from the user's display name.

    Unlike GitHub, the payload is not checked for a bot flag, so the
    lower-cased name is matched against a fixed list of indicators. A missing or
    null name is treated as "unknown".
    """
    try:
        # logic to ignore bot users (unlike Github, no direct flag for bot users in gitlab)
        sender_name = str(data.get("user", {}).get("name") or "unknown").lower()
        bot_indicators = ['codium', 'bot_', 'bot-', '_bot', '-bot']
        if any(indicator in sender_name for indicator in bot_indicators):
            get_logger().info(f"Skipping GitLab bot user: {sender_name}")
            return True
    except Exception as e:
        get_logger().error(f"Failed 'is_bot_user' logic: {e}")
    return False


def is_draft(data) -> bool:
    """
    Tell whether the merge request in the payload is a draft.

    Uses `object_attributes.draft` when present; otherwise falls back to looking
    for 'Draft:' in the title. A missing or null title counts as "not a draft"
    instead of raising.
    """
    try:
        object_attributes = data.get('object_attributes', {})
        if 'draft' in object_attributes:
            return object_attributes['draft']

        # for gitlab server version before 16
        if 'Draft:' in (object_attributes.get('title') or ''):
            return True
    except Exception as e:
        get_logger().error(f"Failed 'is_draft' logic: {e}")
    return False


def is_draft_ready(data) -> bool:
    """
    Tell whether the payload describes a draft that was just marked as ready.

    Uses `changes.draft` (previous True -> current False, booleans or their
    string forms) when present; otherwise falls back to 'Draft:' disappearing
    from the title.
    """
    try:
        changes = data.get('changes', {})
        if 'draft' in changes:
            # Handle both boolean values and string values for compatibility
            previous = changes['draft']['previous']
            current = changes['draft']['current']

            # Convert to boolean if they're strings
            if isinstance(previous, str):
                previous = previous.lower() == 'true'
            if isinstance(current, str):
                current = current.lower() == 'true'

            if previous is True and current is False:
                return True

        # for gitlab server version before 16
        elif 'title' in changes:
            previous_title = changes['title']['previous'] or ''
            current_title = changes['title']['current'] or ''
            if 'Draft:' in previous_title and 'Draft:' not in current_title:
                return True
    except Exception as e:
        get_logger().error(f"Failed 'is_draft_ready' logic: {e}")
    return False
def _as_list(value) -> list:
    """Normalize a config value (missing, scalar or sequence) into a plain list."""
    if not value:
        return []
    if isinstance(value, (list, tuple, set)):
        return list(value)
    return [value]


def _matches_any(patterns, text) -> bool:
    """Return True when `text` is a string that any of `patterns` matches via re.search."""
    return isinstance(text, str) and any(re.search(regex, text) for regex in patterns)


def should_process_pr_logic(data) -> bool:
    """
    Decide whether a merge-request payload passes the configured ignore-filters.

    Checks, in order: repository, author, source branch, target branch, labels
    and title, using the CONFIG.IGNORE_* settings. Missing (null) payload fields
    are skipped rather than aborting the evaluation, so one absent field no
    longer bypasses the remaining filters.

    Returns:
        False when the payload has no object_attributes or a filter matches;
        True otherwise, including when the evaluation itself fails (logged).
    """
    try:
        if not data.get('object_attributes', {}):
            return False
        object_attributes = data['object_attributes']
        title = object_attributes.get('title')

        sender = data.get("user", {}).get("username", "")
        repo_full_name = data.get('project', {}).get('path_with_namespace', "")
        settings = get_settings()

        # logic to ignore PRs from specific repositories
        ignore_repos = _as_list(settings.get("CONFIG.IGNORE_REPOSITORIES", []))
        if ignore_repos and repo_full_name:
            if _matches_any(ignore_repos, repo_full_name):
                get_logger().info(f"Ignoring MR from repository '{repo_full_name}' due to 'config.ignore_repositories' setting")
                return False

        # logic to ignore PRs from specific users
        ignore_pr_users = _as_list(settings.get("CONFIG.IGNORE_PR_AUTHORS", []))
        if ignore_pr_users and sender:
            if _matches_any(ignore_pr_users, sender):
                get_logger().info(f"Ignoring PR from user '{sender}' due to 'config.ignore_pr_authors' settings")
                return False

        # logic to ignore MRs for titles, labels and source, target branches.
        ignore_mr_title = _as_list(settings.get("CONFIG.IGNORE_PR_TITLE", []))
        ignore_mr_labels = _as_list(settings.get("CONFIG.IGNORE_PR_LABELS", []))
        ignore_mr_source_branches = _as_list(settings.get("CONFIG.IGNORE_PR_SOURCE_BRANCHES", []))
        ignore_mr_target_branches = _as_list(settings.get("CONFIG.IGNORE_PR_TARGET_BRANCHES", []))

        if ignore_mr_source_branches:
            source_branch = object_attributes.get('source_branch')
            if _matches_any(ignore_mr_source_branches, source_branch):
                get_logger().info(
                    f"Ignoring MR with source branch '{source_branch}' due to gitlab.ignore_mr_source_branches settings")
                return False

        if ignore_mr_target_branches:
            target_branch = object_attributes.get('target_branch')
            if _matches_any(ignore_mr_target_branches, target_branch):
                get_logger().info(
                    f"Ignoring MR with target branch '{target_branch}' due to gitlab.ignore_mr_target_branches settings")
                return False

        if ignore_mr_labels:
            labels = [label['title'] for label in (object_attributes.get('labels') or [])]
            if any(label in ignore_mr_labels for label in labels):
                labels_str = ", ".join(labels)
                get_logger().info(f"Ignoring MR with labels '{labels_str}' due to gitlab.ignore_mr_labels settings")
                return False

        if ignore_mr_title:
            if _matches_any(ignore_mr_title, title):
                get_logger().info(f"Ignoring MR with title '{title}' due to gitlab.ignore_mr_title settings")
                return False
    except Exception as e:
        # Deliberately permissive: a broken filter must not block processing.
        get_logger().error(f"Failed 'should_process_pr_logic': {e}")
    return True
@router.post("/webhook")
async def gitlab_webhook(background_tasks: BackgroundTasks, request: Request):
    """
    Receive a GitLab webhook and process it as a background task.

    The request is acknowledged with 200 immediately; authentication and event
    handling run in `inner`.

    NOTE(review): because authentication happens inside the background task, the
    401 responses built there never reach the caller - an unauthorized request
    is still answered with 200 (its processing is aborted, though). Confirm
    whether that is intended.
    """
    import hmac  # local import: used only for the constant-time secret comparison

    start_time = datetime.now()
    request_json = await request.json()
    context["settings"] = copy.deepcopy(global_settings)

    def _reply(status_code, message):
        return JSONResponse(status_code=status_code, content=jsonable_encoder({"message": message}))

    def _unauthorized():
        return _reply(status.HTTP_401_UNAUTHORIZED, "unauthorized")

    def _success():
        return _reply(status.HTTP_200_OK, "success")

    async def inner(data: dict):
        log_context = {"server_type": "gitlab_app"}
        get_logger().debug("Received a GitLab webhook")
        request_token = request.headers.get("X-Gitlab-Token")

        # --- authentication -------------------------------------------------
        if request_token and secret_provider:
            secret = secret_provider.get_secret(request_token)
            if not secret:
                # The request token is a credential: never write it to the logs.
                get_logger().warning("Empty secret retrieved for the provided request token")
                return _unauthorized()
            try:
                secret_dict = json.loads(secret)
                gitlab_token = secret_dict["gitlab_token"]
                log_context["token_id"] = secret_dict.get("token_name", secret_dict.get("id", "unknown"))
                context["settings"].gitlab.personal_access_token = gitlab_token
            except Exception as e:
                get_logger().error(f"Failed to validate secret: {e}")
                return _unauthorized()
        elif get_settings().get("GITLAB.SHARED_SECRET"):
            secret = get_settings().get("GITLAB.SHARED_SECRET")
            # Constant-time comparison; bytes are used so non-ASCII values are accepted.
            provided = str(request_token or "").encode("utf-8")
            expected = str(secret).encode("utf-8")
            if not hmac.compare_digest(provided, expected):
                get_logger().error("Failed to validate secret")
                return _unauthorized()
        else:
            get_logger().error("Failed to validate secret")
            return _unauthorized()

        gitlab_token = get_settings().get("GITLAB.PERSONAL_ACCESS_TOKEN", None)
        if not gitlab_token:
            get_logger().error("No gitlab token found")
            return _unauthorized()

        # --- event handling -------------------------------------------------
        get_logger().info("GitLab data", artifact=data)
        sender = data.get("user", {}).get("username", "unknown")
        sender_id = data.get("user", {}).get("id", "unknown")

        # ignore bot users
        if is_bot_user(data):
            return _success()

        log_context["sender"] = sender
        if data.get('object_kind') == 'merge_request':
            # ignore MRs based on title, labels, source and target branches
            if not should_process_pr_logic(data):
                return _success()
            object_attributes = data.get('object_attributes', {})
            if object_attributes.get('action') in ['open', 'reopen']:
                url = object_attributes.get('url')
                get_logger().info(f"New merge request: {url}")
                if is_draft(data):
                    get_logger().info(f"Skipping draft MR: {url}")
                    return _success()

                await _perform_commands_gitlab("pr_commands", PRAgent(), url, log_context, data)

            # for push event triggered merge requests
            elif object_attributes.get('action') == 'update' and object_attributes.get('oldrev'):
                url = object_attributes.get('url')
                get_logger().info(f"New merge request: {url}")
                if is_draft(data):
                    get_logger().info(f"Skipping draft MR: {url}")
                    return _success()

                # Apply repo settings before checking push commands or handle_push_trigger
                apply_repo_settings(url)

                commands_on_push = get_settings().get(f"gitlab.push_commands", {})
                handle_push_trigger = get_settings().get(f"gitlab.handle_push_trigger", False)
                if not commands_on_push or not handle_push_trigger:
                    get_logger().info("Push event, but no push commands found or push trigger is disabled")
                    return _success()

                get_logger().debug(f'A push event has been received: {url}')
                await _perform_commands_gitlab("push_commands", PRAgent(), url, log_context, data)

            # for draft to ready triggered merge requests
            elif object_attributes.get('action') == 'update' and is_draft_ready(data):
                url = object_attributes.get('url')
                get_logger().info(f"Draft MR is ready: {url}")

                # same as open MR
                await _perform_commands_gitlab("pr_commands", PRAgent(), url, log_context, data)

        elif data.get('object_kind') == 'note' and data.get('event_type') == 'note':  # comment on MR
            if 'merge_request' in data:
                mr = data['merge_request']
                url = mr.get('url')
                comment_id = data.get('object_attributes', {}).get('id')
                provider = get_git_provider_with_context(pr_url=url)

                get_logger().info(f"A comment has been added to a merge request: {url}")
                body = data.get('object_attributes', {}).get('note')
                # `body` may be null; guard the substring test.
                if body and data.get('object_attributes', {}).get('type') == 'DiffNote' and '/ask' in body:  # /ask_line
                    body = handle_ask_line(body, data)

                await handle_request(url, body, log_context, sender_id,
                                     notify=lambda: provider.add_eyes_reaction(comment_id))

    background_tasks.add_task(inner, request_json)
    end_time = datetime.now()
    get_logger().info(f"Processing time: {end_time - start_time}", request=request_json)
    return _success()
data.get('object_attributes', {}).get('id') provider = get_git_provider_with_context(pr_url=url) get_logger().info(f"A comment has been added to a merge request: {url}") body = data.get('object_attributes', {}).get('note') if data.get('object_attributes', {}).get('type') == 'DiffNote' and '/ask' in body: # /ask_line body = handle_ask_line(body, data) await handle_request(url, body, log_context, sender_id, notify=lambda: provider.add_eyes_reaction(comment_id)) background_tasks.add_task(inner, request_json) end_time = datetime.now() get_logger().info(f"Processing time: {end_time - start_time}", request=request_json) return JSONResponse(status_code=status.HTTP_200_OK, content=jsonable_encoder({"message": "success"})) def handle_ask_line(body, data): try: line_range_ = data['object_attributes']['position']['line_range'] # if line_range_['start']['type'] == 'new': start_line = line_range_['start']['new_line'] end_line = line_range_['end']['new_line'] # else: # start_line = line_range_['start']['old_line'] # end_line = line_range_['end']['old_line'] question = body.replace('/ask', '').strip() path = data['object_attributes']['position']['new_path'] side = 'RIGHT' # if line_range_['start']['type'] == 'new' else 'LEFT' comment_id = data['object_attributes']["discussion_id"] get_logger().info("Handling line ") body = f"/ask_line --line_start={start_line} --line_end={end_line} --side={side} --file_name={path} --comment_id={comment_id} {question}" except Exception as e: get_logger().error(f"Failed to handle ask line comment: {e}") return body @router.get("/") async def root(): return {"status": "ok"} gitlab_url = get_settings().get("GITLAB.URL", None) if not gitlab_url: raise ValueError("GITLAB.URL is not set") get_settings().config.git_provider = "gitlab" middleware = [Middleware(RawContextMiddleware)] app = FastAPI(middleware=middleware) app.include_router(router) def start(): """ Start the GitLab webhook server. 
# Gunicorn settings for the PR-Agent servers.
# Gunicorn reads the module-level names below; each group is preceded by a short
# explanation of what the settings control.

# --- Server socket ---------------------------------------------------------
# bind:    address to listen on ('HOST', 'HOST:PORT' or 'unix:PATH').
# backlog: how many clients may wait for a connection before new ones are
#          refused; must be a positive integer, usually between 64 and 2048.
bind = '0.0.0.0:3000'
backlog = 2048

# --- Worker processes ------------------------------------------------------
# workers:            number of worker processes kept alive. Taken from the
#                     GUNICORN_WORKERS environment variable when it is set,
#                     otherwise derived from the CPU count (2 * cores + 1).
# worker_connections: per-process cap on simultaneous clients (eventlet/gevent
#                     worker classes only).
# timeout:            seconds a worker may stay silent before the master kills
#                     and replaces it.
# keepalive:          seconds to wait for the next request on a Keep-Alive
#                     connection.
if not os.getenv('GUNICORN_WORKERS', None):
    cores = multiprocessing.cpu_count()
    workers = 2 * cores + 1
else:
    workers = int(os.getenv('GUNICORN_WORKERS'))
worker_connections = 1000
timeout = 240
keepalive = 2

# --- Debugging -------------------------------------------------------------
# spew: when True, installs a trace function that prints every executed line
#       of Python. Extremely verbose; keep it off outside of emergencies.
spew = False

# --- Server mechanics ------------------------------------------------------
# daemon:         detach the master process from the controlling terminal.
# raw_env:        extra environment variables for the execution environment.
# pidfile:        path of a pid file to write, or None for no pid file.
# umask:          file-mode mask for files written by Gunicorn (this also
#                 affects unix socket permissions).
# user / group:   identity the workers switch to, or None to leave unchanged.
# tmp_upload_dir: directory for temporary request data, or None to let Python
#                 pick one.
daemon = False
raw_env = []
pidfile = None
umask = 0
user = None
group = None
tmp_upload_dir = None

# --- Logging ---------------------------------------------------------------
# errorlog:          error log destination; "-" means stdout.
# loglevel:          one of "debug", "info", "warning", "error", "critical".
# accesslog:         access log destination; None as configured here.
# access_log_format: layout of each access log line.
errorlog = '-'
loglevel = 'info'
accesslog = None
access_log_format = '%(h)s %(l)s %(u)s %(t)s "%(r)s" %(s)s %(b)s "%(f)s" "%(a)s"'

# --- Process naming --------------------------------------------------------
# proc_name: base name used with setproctitle so the processes can be told
#            apart in 'ps' / 'top'; None keeps Gunicorn's default.
proc_name = None

# --- Server hooks ----------------------------------------------------------
# No hooks (post_fork, pre_fork, pre_exec) are defined in this configuration.
More feedbacks can be [added](https://pr-agent-docs.codium.ai/tools/review/#general-configurations) by configuring the tool.\n\n" "The tool can be triggered [automatically](https://pr-agent-docs.codium.ai/usage-guide/automations_and_usage/#github-app-automatic-tools-when-a-new-pr-is-opened) every time a new PR is opened, or can be invoked manually by commenting on any PR.\n") output +="""\ - When commenting, to edit [configurations](https://github.com/Codium-ai/pr-agent/blob/main/pr_agent/settings/configuration.toml#L23) related to the review tool (`pr_reviewer` section), use the following template: ``` /review --pr_reviewer.some_config1=... --pr_reviewer.some_config2=... ``` - With a [configuration file](https://pr-agent-docs.codium.ai/usage-guide/configuration_options/), use the following template: ``` [pr_reviewer] some_config1=... some_config2=... ``` """ output += f"\n\nSee the review [usage page](https://pr-agent-docs.codium.ai/tools/review/) for a comprehensive guide on using this tool.\n\n" return output @staticmethod def get_describe_usage_guide(): output = "**Overview:**\n" output += "The `describe` tool scans the PR code changes, and generates a description for the PR - title, type, summary, walkthrough and labels. " output += "The tool can be triggered [automatically](https://pr-agent-docs.codium.ai/usage-guide/automations_and_usage/#github-app-automatic-tools-when-a-new-pr-is-opened) every time a new PR is opened, or can be invoked manually by commenting on a PR.\n" output += """\ When commenting, to edit [configurations](https://github.com/Codium-ai/pr-agent/blob/main/pr_agent/settings/configuration.toml#L46) related to the describe tool (`pr_description` section), use the following template: ``` /describe --pr_description.some_config1=... --pr_description.some_config2=... ``` With a [configuration file](https://pr-agent-docs.codium.ai/usage-guide/configuration_options/), use the following template: ``` [pr_description] some_config1=... 
some_config2=... ``` """ output += "\n\n" # automation output += "\n\n" # custom labels output += "\n\n" # extra instructions output += "
      Enabling\\disabling automation
      \n\n" output += """\ - When you first install the app, the [default mode](https://pr-agent-docs.codium.ai/usage-guide/automations_and_usage/#github-app-automatic-tools-when-a-new-pr-is-opened) for the describe tool is: ``` pr_commands = ["/describe", ...] ``` meaning the `describe` tool will run automatically on every PR. - Markers are an alternative way to control the generated description, to give maximal control to the user. If you set: ``` pr_commands = ["/describe --pr_description.use_description_markers=true", ...] ``` the tool will replace every marker of the form `pr_agent:marker_name` in the PR description with the relevant content, where `marker_name` is one of the following: - `type`: the PR type. - `summary`: the PR summary. - `walkthrough`: the PR walkthrough. - `diagram`: the PR sequence diagram (if enabled). Note that when markers are enabled, if the original PR description does not contain any markers, the tool will not alter the description at all. """ output += "\n\n
      Custom labels
      \n\n" output += """\ The default labels of the `describe` tool are quite generic: [`Bug fix`, `Tests`, `Enhancement`, `Documentation`, `Other`]. If you specify [custom labels](https://pr-agent-docs.codium.ai/tools/describe/#handle-custom-labels-from-the-repos-labels-page) in the repo's labels page or via configuration file, you can get tailored labels for your use cases. Examples for custom labels: - `Main topic:performance` - pr_agent:The main topic of this PR is performance - `New endpoint` - pr_agent:A new endpoint was added in this PR - `SQL query` - pr_agent:A new SQL query was added in this PR - `Dockerfile changes` - pr_agent:The PR contains changes in the Dockerfile - ... The list above is eclectic, and aims to give an idea of different possibilities. Define custom labels that are relevant for your repo and use cases. Note that Labels are not mutually exclusive, so you can add multiple label categories. Make sure to provide proper title, and a detailed and well-phrased description for each label, so the tool will know when to suggest it. """ output += "\n\n
      Utilizing extra instructions
      \n\n" output += '''\ The `describe` tool can be configured with extra instructions, to guide the model to a feedback tailored to the needs of your project. Be specific, clear, and concise in the instructions. With extra instructions, you are the prompter. Notice that the general structure of the description is fixed, and cannot be changed. Extra instructions can change the content or style of each sub-section of the PR description. Examples for extra instructions: ``` [pr_description] extra_instructions="""\ - The PR title should be in the format: ': ' - The title should be short and concise (up to 10 words) - ... """ ``` Use triple quotes to write multi-line instructions. Use bullet points to make the instructions more readable. ''' output += "\n\n</details></td></tr>\n\n" # general output += "\n\n<tr><td><details> <summary><strong> More PR-Agent commands</strong></summary><hr> \n\n" output += HelpMessage.get_general_bot_help_text() output += "\n\n</details></td></tr>\n\n" output += "</table>" output += f"\n\nSee the [describe usage](https://pr-agent-docs.codium.ai/tools/describe/) page for a comprehensive guide on using this tool.\n\n" return output @staticmethod def get_ask_usage_guide(): output = "**Overview:**\n" output += """\ The `ask` tool answers questions about the PR, based on the PR code changes. It can be invoked manually by commenting on any PR: ``` /ask "..." ``` Note that the tool does not have "memory" of previous questions, and answers each question independently. You can ask questions about the entire PR, about specific code lines, or about an image related to the PR code changes. 
""" # output += "\n\n<table>" # # # # general # # output += "\n\n<tr><td><details> <summary><strong> More PR-Agent commands</strong></summary><hr> \n\n" # # output += HelpMessage.get_general_bot_help_text() # # output += "\n\n</details></td></tr>\n\n" # # output += "</table>" output += f"\n\nSee the [ask usage](https://pr-agent-docs.codium.ai/tools/ask/) page for a comprehensive guide on using this tool.\n\n" return output @staticmethod def get_improve_usage_guide(): output = "**Overview:**\n" output += "The code suggestions tool, named `improve`, scans the PR code changes, and automatically generates code suggestions for improving the PR." output += "The tool can be triggered [automatically](https://pr-agent-docs.codium.ai/usage-guide/automations_and_usage/#github-app-automatic-tools-when-a-new-pr-is-opened) every time a new PR is opened, or can be invoked manually by commenting on a PR.\n" output += """\ - When commenting, to edit [configurations](https://github.com/Codium-ai/pr-agent/blob/main/pr_agent/settings/configuration.toml#L78) related to the improve tool (`pr_code_suggestions` section), use the following template: ``` /improve --pr_code_suggestions.some_config1=... --pr_code_suggestions.some_config2=... ``` - With a [configuration file](https://pr-agent-docs.codium.ai/usage-guide/configuration_options/), use the following template: ``` [pr_code_suggestions] some_config1=... some_config2=... ``` """ output += f"\n\nSee the improve [usage page](https://pr-agent-docs.codium.ai/tools/improve/) for a comprehensive guide on using this tool.\n\n" return output @staticmethod def get_help_docs_usage_guide(): output = "**Overview:**\n" output += """\ The help docs tool, named `help_docs`, answers a question based on a given relative path of documentation, either from the repository of this merge request or from a given one." It can be invoked manually by commenting on any PR: ``` /help_docs "..." 
class DefaultDictWithTimeout(defaultdict):
    """A defaultdict whose entries expire after a time-to-live (TTL).

    Every key has a timestamp (monotonic clock) recording when it was last set
    and, optionally, last read. Reads trigger a sweep, at most once per
    ``refresh_interval`` seconds, that deletes every key whose timestamp is
    older than ``ttl`` seconds. Expiry is therefore lazy: an entry disappears on
    the first sweep after it became too old, not at the exact moment.
    """

    def __init__(
        self,
        default_factory: Callable[[], Any] = None,
        ttl: int = None,
        refresh_interval: int = 60,
        update_key_time_on_get: bool = True,
        *args,
        **kwargs,
    ):
        """
        Args:
            default_factory: The default factory to use for keys that are not in the dictionary.
            ttl: The time-to-live (TTL) in seconds. ``None`` disables expiry.
            refresh_interval: How often to refresh the dict and delete items older than the TTL.
            update_key_time_on_get: Whether to update the access time of a key also on get (or only when set).
            *args, **kwargs: Initial content, forwarded to ``defaultdict``.
        """
        super().__init__(default_factory, *args, **kwargs)
        # Initial items are stored by dict's constructor without going through
        # __setitem__, so they are timestamped here to make them expire as well.
        now = self.__time()
        self.__key_times = {key: now for key in self}
        self.__ttl = ttl
        self.__refresh_interval = refresh_interval
        self.__update_key_time_on_get = update_key_time_on_get
        # Back-date the last sweep so that the first read is allowed to sweep.
        self.__last_refresh = now - self.__refresh_interval

    @staticmethod
    def __time():
        """Return the current time on a clock that never goes backwards."""
        return time.monotonic()

    def __refresh(self):
        """Delete all expired keys, at most once per refresh interval."""
        if self.__ttl is None:
            return
        request_time = self.__time()
        # Throttle: skip while the previous sweep is still recent. (The original
        # comparison was inverted, which made the sweep effectively never run.)
        if request_time - self.__last_refresh < self.__refresh_interval:
            return
        to_delete = [key for key, key_time in self.__key_times.items() if request_time - key_time > self.__ttl]
        for key in to_delete:
            if key in self:
                del self[key]
            else:
                # The value was removed through a dict method that bypasses
                # __delitem__ (e.g. pop/clear); only the timestamp is left.
                self.__key_times.pop(key, None)
        self.__last_refresh = request_time

    def __getitem__(self, __key):
        # Touch the key before sweeping so that a key being read right now is not
        # expired by this very access. Missing keys are not touched here: if the
        # default factory creates them, __setitem__ records their timestamp.
        if self.__update_key_time_on_get and __key in self:
            self.__key_times[__key] = self.__time()
        self.__refresh()
        return super().__getitem__(__key)

    def __setitem__(self, __key, __value):
        self.__key_times[__key] = self.__time()
        return super().__setitem__(__key, __value)

    def __delitem__(self, __key):
        # pop() instead of del: a key without a timestamp must still be deletable,
        # and a missing key should raise KeyError from the dict itself.
        self.__key_times.pop(__key, None)
        return super().__delitem__(__key)
"https://<your resource name>.openai.azure.com" #deployment_id = "" # The deployment name you chose when you deployed the engine #fallback_deployments = [] # For each fallback model specified in configuration.toml in the [config] section, specify the appropriate deployment_id # OpenAI Flex Processing (optional, for cost savings) # [litellm] # extra_body='{"processing_mode": "flex"}' # model_id = "" # Optional: Custom inference profile ID for Amazon Bedrock [pinecone] api_key = "..." environment = "gcp-starter" [qdrant] # For Qdrant Cloud or self-hosted Qdrant url = "" # e.g., https://xxxxxxxx-xxxxxxxx.eu-central-1-0.aws.cloud.qdrant.io api_key = "" [anthropic] key = "" # Optional, uncomment if you want to use Anthropic. Acquire through https://www.anthropic.com/ [cohere] key = "" # Optional, uncomment if you want to use Cohere. Acquire through https://dashboard.cohere.ai/ [replicate] key = "" # Optional, uncomment if you want to use Replicate. Acquire through https://replicate.com/ [groq] key = "" # Acquire through https://console.groq.com/keys [xai] key = "" # Optional, uncomment if you want to use xAI. Acquire through https://console.x.ai/ [huggingface] key = "" # Optional, uncomment if you want to use Huggingface Inference API. Acquire through https://huggingface.co/docs/api-inference/quicktour api_base = "" # the base url for your huggingface inference endpoint [ollama] api_base = "" # the base url for your local Llama 2, Code Llama, and other models inference endpoint. Acquire through https://ollama.ai/ [vertexai] vertex_project = "" # the google cloud platform project name for your vertexai deployment vertex_location = "" # the google cloud platform location for your vertexai deployment [google_ai_studio] gemini_api_key = "" # the google AI Studio API key [github] # ---- Set the following only for deployment type == "user" user_token = "" # A GitHub personal access token with 'repo' scope. 
deployment_type = "user" #set to user by default # ---- Set the following only for deployment type == "app", see README for details. private_key = """\ -----BEGIN RSA PRIVATE KEY----- <GITHUB PRIVATE KEY> -----END RSA PRIVATE KEY----- """ app_id = 123456 # The GitHub App ID, replace with your own. webhook_secret = "<WEBHOOK SECRET>" # Optional, may be commented out. [gitlab] # Gitlab personal access token personal_access_token = "" shared_secret = "" # webhook secret [gitea] # Gitea personal access token personal_access_token="" webhook_secret="" # webhook secret [bitbucket] # For Bitbucket authentication auth_type = "bearer" # "bearer" or "basic" # For bearer token authentication bearer_token = "" # For basic authentication (uses token only) basic_token = "" [bitbucket_server] # For Bitbucket Server bearer token bearer_token = "" webhook_secret = "" # For Bitbucket app app_key = "" url = "" [azure_devops] # For Azure devops personal access token org = "" pat = "" [azure_devops_server] # For Azure devops Server basic auth - configured in the webhook creation # Optional, uncomment if you want to use Azure devops webhooks. 
Value assigned when you create the webhook # webhook_username = "<basic auth user>" # webhook_password = "<basic auth password>" [deepseek] key = "" [deepinfra] key = "" [azure_ad] # Azure AD authentication for OpenAI services client_id = "" # Your Azure AD application client ID client_secret = "" # Your Azure AD application client secret tenant_id = "" # Your Azure AD tenant ID api_base = "" # Your Azure OpenAI service base URL (e.g., https://openai.xyz.com/) [openrouter] key = "" api_base = "" [aws] AWS_ACCESS_KEY_ID = "" AWS_SECRET_ACCESS_KEY = "" AWS_REGION_NAME = "" [aws_secrets_manager] secret_arn = "" # The ARN of the AWS Secrets Manager secret containing PR-Agent configuration region_name = "" # Optional: specific AWS region (defaults to AWS_REGION_NAME or Lambda region) ================================================ FILE: pr_agent/settings/code_suggestions/pr_code_suggestions_prompts.toml ================================================ [pr_code_suggestions_prompt] system="""You are PR-Reviewer, an AI specializing in Pull Request (PR) code analysis and suggestions. {%- if not focus_only_on_problems %} Your task is to examine the provided code diff, focusing on new code (lines prefixed with '+'), and offer concise, actionable suggestions to fix possible bugs and problems, and enhance code quality and performance. {%- else %} Your task is to examine the provided code diff, focusing on new code (lines prefixed with '+'), and offer concise, actionable suggestions to fix critical bugs and problems. {%- endif %} The PR code diff will be in the following structured format: ====== ## File: 'src/file1.py' {%- if is_ai_metadata %} ### AI-generated changes summary: * ... * ... {%- endif %} @@ ... @@ def func1(): __new hunk__ unchanged code line0 unchanged code line1 +new code line2 added unchanged code line3 __old hunk__ unchanged code line0 unchanged code line1 -old code line2 removed unchanged code line3 @@ ... 
@@ def func2(): __new hunk__ unchanged code line4 +new code line5 added unchanged code line6 ## File: 'src/file2.py' ... ====== Important notes about the structured diff format above: 1. Each PR code chunk is decoupled into separate '__new hunk__' and '__old hunk__' sections: - The '__new hunk__' section shows the code chunk AFTER the PR changes. - The '__old hunk__' section shows the code chunk BEFORE the PR changes. If no code was removed from the chunk, the '__old hunk__' section will be omitted. 2. The diff uses line prefixes to show changes: '+' → new line code added (will appear only in '__new hunk__') '-' → line code removed (will appear only in '__old hunk__') ' ' → unchanged context lines (will appear in both sections) {%- if is_ai_metadata %} 3. When available, an AI-generated summary will precede each file's diff, with a high-level overview of the changes. Note that this summary may not be fully accurate or complete. {%- endif %} Specific guidelines for generating code suggestions: {%- if not focus_only_on_problems %} - Provide up to {{ num_code_suggestions }} distinct and insightful code suggestions. {%- else %} - Provide up to {{ num_code_suggestions }} distinct and insightful code suggestions. Return less suggestions if no pertinent ones are applicable. {%- endif %} - DO NOT suggest implementing changes that are already present in the '+' lines compared to the '-' lines. - Focus your suggestions ONLY on new code introduced in the PR ('+' lines in '__new hunk__' sections). {%- if not focus_only_on_problems %} - Prioritize suggestions that address potential issues, critical problems, and bugs in the PR code. Avoid repeating changes already implemented in the PR. If no pertinent suggestions are applicable, return an empty list. - Don't suggest to add docstring, type hints, or comments, to remove unused imports, or to use more specific exception types. {%- else %} - Only give suggestions that address critical problems and bugs in the PR code. 
If no relevant suggestions are applicable, return an empty list. - DO NOT suggest the following: - change packages version - add missing import statement - declare undefined variable, or remove unused variable - use more specific exception types - repeat changes already done in the PR code {%- endif %} - Be aware that your input consists only of partial code segments (PR diff code), not the complete codebase. Therefore, avoid making suggestions that might duplicate existing functionality, and refrain from questioning code elements (such as variable declarations or import statements) that may be defined elsewhere in the codebase. - When mentioning code elements (variables, names, or files) in your response, surround them with backticks (`). For example: "verify that `user_id` is..." {%- if extra_instructions %} Extra user-provided instructions (should be addressed with high priority): ====== {{ extra_instructions }} ====== {%- endif %} The output must be a YAML object equivalent to type $PRCodeSuggestions, according to the following Pydantic definitions: ===== class CodeSuggestion(BaseModel): relevant_file: str = Field(description="Full path of the relevant file") language: str = Field(description="Programming language used by the relevant file") existing_code: str = Field(description="A short code snippet, from a '__new hunk__' section after the PR changes, that the suggestion aims to enhance or fix. Include only complete code lines. Use ellipsis (...) for brevity if needed. This snippet should represent the specific PR code targeted for improvement.") suggestion_content: str = Field(description="An actionable suggestion to enhance, improve or fix the new code introduced in the PR. Don't present here actual code snippets, just the suggestion. 
Be short and concise") improved_code: str = Field(description="A refined code snippet that replaces the 'existing_code' snippet after implementing the suggestion.") one_sentence_summary: str = Field(description="A concise, single-sentence overview (up to 6 words) of the suggested improvement. Focus on the 'what'. Be general, and avoid method or variable names.") {%- if not focus_only_on_problems %} label: str = Field(description="A single, descriptive label that best characterizes the suggestion type. Possible labels include 'security', 'possible bug', 'possible issue', 'performance', 'enhancement', 'best practice', 'maintainability', 'typo'. Other relevant labels are also acceptable.") {%- else %} label: str = Field(description="A single, descriptive label that best characterizes the suggestion type. Possible labels include 'security', 'critical bug', 'general'. The 'general' section should be used for suggestions that address a major issue, but are not necessarily on a critical level.") {%- endif %} class PRCodeSuggestions(BaseModel): code_suggestions: List[CodeSuggestion] ===== Example output: ```yaml code_suggestions: - relevant_file: | src/file1.py language: | python existing_code: | ... suggestion_content: | ... improved_code: | ... one_sentence_summary: | ... label: | ... ``` Each YAML output MUST be after a newline, indented, with block scalar indicator ('|'). """ user="""--PR Info-- Title: '{{title}}' {%- if date %} Today's Date: {{date}} {%- endif %} The PR Diff: ====== {{ diff_no_line_numbers|trim }} ====== {%- if duplicate_prompt_examples %} Example output: ```yaml code_suggestions: - relevant_file: | src/file1.py language: | python existing_code: | ... suggestion_content: | ... improved_code: | ... one_sentence_summary: | ... label: | ... ``` (replace '...' 
with actual content) {%- endif %} Response (should be a valid YAML, and nothing else): ```yaml """ ================================================ FILE: pr_agent/settings/code_suggestions/pr_code_suggestions_prompts_not_decoupled.toml ================================================ [pr_code_suggestions_prompt_not_decoupled] system="""You are PR-Reviewer, an AI specializing in Pull Request (PR) code analysis and suggestions. {%- if not focus_only_on_problems %} Your task is to examine the provided code diff, focusing on new code (lines prefixed with '+'), and offer concise, actionable suggestions to fix possible bugs and problems, and enhance code quality and performance. {%- else %} Your task is to examine the provided code diff, focusing on new code (lines prefixed with '+'), and offer concise, actionable suggestions to fix critical bugs and problems. {%- endif %} The PR code diff will be in the following structured format: ====== ## File: 'src/file1.py' {%- if is_ai_metadata %} ### AI-generated changes summary: * ... * ... {%- endif %} @@ ... @@ def func1(): unchanged code line0 unchanged code line1 +new code line2 -removed code line2 unchanged code line3 @@ ... @@ def func2(): ... ## File: 'src/file2.py' ... ====== The diff structure above uses line prefixes to show changes: '+' → new line code added '-' → line code removed ' ' → unchanged context lines {%- if is_ai_metadata %} When available, an AI-generated summary will precede each file's diff, with a high-level overview of the changes. Note that this summary may not be fully accurate or complete. {%- endif %} Specific guidelines for generating code suggestions: {%- if not focus_only_on_problems %} - Provide up to {{ num_code_suggestions }} distinct and insightful code suggestions. {%- else %} - Provide up to {{ num_code_suggestions }} distinct and insightful code suggestions. Return less suggestions if no pertinent ones are applicable. 
{%- endif %} - Focus your suggestions ONLY on improving the new code introduced in the PR (lines starting with '+' in the diff). The lines in the diff starting with '-' are only for reference and should not be considered for suggestions. {%- if not focus_only_on_problems %} - Prioritize suggestions that address potential issues, critical problems, and bugs in the PR code. Avoid repeating changes already implemented in the PR. If no pertinent suggestions are applicable, return an empty list. - Don't suggest to add docstring, type hints, or comments, to remove unused imports, or to use more specific exception types. {%- else %} - Only give suggestions that address critical problems and bugs in the PR code. If no relevant suggestions are applicable, return an empty list. - DO NOT suggest the following: - change packages version - add missing import statement - declare undefined variable, add missing imports, etc. - use more specific exception types {%- endif %} - When mentioning code elements (variables, names, or files) in your response, surround them with markdown backticks (`). For example: "verify that `user_id` is..." - Note that you will only see partial code segments that were changed (diff hunks in a PR code), and not the entire codebase. Avoid suggestions that might duplicate existing functionality of the outer codebase. In addition, the absence of a definition, declaration, import, or initialization for any entity in the PR code is NEVER a basis for a suggestion. - Also note that if the code ends at an opening brace or statement that begins a new scope (like 'if', 'for', 'try'), don't treat it as incomplete. Instead, acknowledge the visible scope boundary and analyze only the code shown. 
{%- if extra_instructions %} Extra user-provided instructions (should be addressed with high priority): ====== {{ extra_instructions }} ====== {%- endif %} The output must be a YAML object equivalent to type $PRCodeSuggestions, according to the following Pydantic definitions: ===== class CodeSuggestion(BaseModel): relevant_file: str = Field(description="Full path of the relevant file") language: str = Field(description="Programming language used by the relevant file") existing_code: str = Field(description="A short code snippet, from the final state of the PR diff, that the suggestion will address. Select only the specific span of code that will be modified - without surrounding unchanged code. Preserve all indentation, newlines, and original formatting. Show the code snippet without the '+'/'-'/' ' prefixes. When providing suggestions for long code sections, shorten the presented code with ellipsis (...) for brevity where possible.") suggestion_content: str = Field(description="An actionable suggestion to enhance, improve or fix the new code introduced in the PR. Use 2-3 short sentences.") improved_code: str = Field(description="A refined code snippet that replaces the 'existing_code' snippet after implementing the suggestion.") one_sentence_summary: str = Field(description="A single-sentence overview (up to 6 words) of the suggestion. Focus on the 'what'. Be general, and avoid mentioning method or variable names.") {%- if not focus_only_on_problems %} label: str = Field(description="A single, descriptive label that best characterizes the suggestion type. Possible labels include 'security', 'possible bug', 'possible issue', 'performance', 'enhancement', 'best practice', 'maintainability', 'typo'. Other relevant labels are also acceptable.") {%- else %} label: str = Field(description="A single, descriptive label that best characterizes the suggestion type. Possible labels include 'security', 'critical bug', 'general'. 
The 'general' section should be used for suggestions that address a major issue, but are not necessarily on a critical level.") {%- endif %} class PRCodeSuggestions(BaseModel): code_suggestions: List[CodeSuggestion] ===== Example output: ```yaml code_suggestions: - relevant_file: | src/file1.py language: | python existing_code: | ... suggestion_content: | ... improved_code: | ... one_sentence_summary: | ... label: | ... ``` Each YAML output MUST be after a newline, indented, with block scalar indicator ('|'). """ user="""--PR Info-- Title: '{{title}}' {%- if date %} Today's Date: {{date}} {%- endif %} The PR Diff: ====== {{ diff_no_line_numbers|trim }} ====== {%- if duplicate_prompt_examples %} Example output: ```yaml code_suggestions: - relevant_file: | src/file1.py language: | python existing_code: | ... suggestion_content: | ... improved_code: | ... one_sentence_summary: | ... label: | ... ``` (replace '...' with actual content) {%- endif %} Response (should be a valid YAML, and nothing else): ```yaml """ ================================================ FILE: pr_agent/settings/code_suggestions/pr_code_suggestions_reflect_prompts.toml ================================================ [pr_code_suggestions_reflect_prompt] system="""You are an AI language model specialized in reviewing and evaluating code suggestions for a Pull Request (PR). Your task is to analyze a PR code diff and evaluate the correctness and importance set of AI-generated code suggestions. In addition to evaluating the suggestion correctness and importance, another sub-task you have is to detect the line numbers in the '__new hunk__' of the PR code diff section that correspond to the 'existing_code' snippet. Examine each suggestion meticulously, assessing its quality, relevance, and accuracy within the context of PR. Keep in mind that the suggestions may vary in their correctness, accuracy and impact. Consider the following components of each suggestion: 1. 
'one_sentence_summary' - A one-liner summary of the suggestion's purpose 2. 'suggestion_content' - The suggestion content, explaining the proposed modification 3. 'existing_code' - a code snippet from a __new hunk__ section in the PR code diff that the suggestion addresses 4. 'improved_code' - a code snippet demonstrating how the 'existing_code' should be after the suggestion is applied Be particularly vigilant for suggestions that: - Overlook crucial details in the PR code - The 'improved_code' section does not accurately reflect the suggested changes, in relation to the 'existing_code' - Contradict or ignore parts of the PR's modifications In such cases, assign the suggestion a score of 0. Evaluate each valid suggestion by scoring its potential impact on the PR's correctness, quality and functionality. Key guidelines for evaluation: - Thoroughly examine both the suggestion content and the corresponding PR code diff. Be vigilant for potential errors in each suggestion, ensuring they are logically sound, accurate, and directly derived from the PR code diff. - Extend your review beyond the specifically mentioned code lines to encompass surrounding PR code context, verifying the suggestions' contextual accuracy. - Validate the 'existing_code' field by confirming it matches or is accurately derived from code lines within a '__new hunk__' section of the PR code diff. - Ensure the 'improved_code' section accurately reflects the 'existing_code' segment after the suggested modification is applied. - Apply a nuanced scoring system: - Reserve high scores (8-10) for suggestions addressing critical issues such as major bugs or security concerns. - Assign moderate scores (3-7) to suggestions that tackle minor issues, improve code style, enhance readability, or boost maintainability. - Avoid inflating scores for suggestions that, while correct, offer only marginal improvements or optimizations. 
- Maintain the original order of suggestions in your feedback, corresponding to their input sequence. Additional scoring considerations: - If the suggestion only asks the user to verify or ensure a change done in the PR, it should not receive a score above 7 (and may be lower). - Error handling or type checking suggestions should not receive a score above 8 (and may be lower). - If the 'existing_code' snippet is equal to the 'improved_code' snippet, it should not receive a score above 7 (and may be lower). - Assume each suggestion is independent and is not influenced by the other suggestions. - Assign a score of 0 to suggestions aiming at: - Adding docstring, type hints, or comments - Remove unused imports or variables - Add missing import statements - Using more specific exception types. - Questions the definition, declaration, import, or initialization of any entity in the PR code, that might be done in the outer codebase. The PR code diff will be presented in the following structured format: ====== ## File: 'src/file1.py' {%- if is_ai_metadata %} ### AI-generated changes summary: * ... * ... {%- endif %} @@ ... @@ def func1(): __new hunk__ 11 unchanged code line0 12 unchanged code line1 13 +new code line2 added 14 unchanged code line3 __old hunk__ unchanged code line0 unchanged code line1 -old code line2 removed unchanged code line3 @@ ... @@ def func2(): __new hunk__ ... __old hunk__ ... ## File: 'src/file2.py' ... ====== - In the format above, the diff is organized into separate '__new hunk__' and '__old hunk__' sections for each code chunk. '__new hunk__' contains the updated code, while '__old hunk__' shows the removed code. If no code was added or removed in a specific chunk, the corresponding section will be omitted. - Line numbers are included for the '__new hunk__' sections to enable referencing specific lines in the code suggestions. These numbers are for reference only and are not part of the actual code. 
- Code lines are prefixed with symbols: '+' for new code added in the PR, '-' for code removed, and ' ' for unchanged code. {%- if is_ai_metadata %} - When available, an AI-generated summary will precede each file's diff, with a high-level overview of the changes. Note that this summary may not be fully accurate or comprehensive. {%- endif %} The output must be a YAML object equivalent to type $PRCodeSuggestionsFeedback, according to the following Pydantic definitions: ===== class CodeSuggestionFeedback(BaseModel): suggestion_summary: str = Field(description="Repeated from the input") relevant_file: str = Field(description="Repeated from the input") relevant_lines_start: int = Field(description="The relevant line number, from a '__new hunk__' section, where the suggestion starts (inclusive). Should be derived from the added '__new hunk__' line numbers, and correspond to the first line of the relevant 'existing code' snippet.") relevant_lines_end: int = Field(description="The relevant line number, from a '__new hunk__' section, where the suggestion ends (inclusive). Should be derived from the added '__new hunk__' line numbers, and correspond to the end of the relevant 'existing code' snippet") suggestion_score: int = Field(description="Evaluate the suggestion and assign a score from 0 to 10. Give 0 if the suggestion is wrong. For valid suggestions, score from 1 (lowest impact/importance) to 10 (highest impact/importance).") why: str = Field(description="Briefly explain the score given in 1-2 short sentences, focusing on the suggestion's impact, relevance, and accuracy. 
When mentioning code elements (variables, names, or files) in your response, surround them with markdown backticks (`).") class PRCodeSuggestionsFeedback(BaseModel): code_suggestions: List[CodeSuggestionFeedback] ===== Example output: ```yaml code_suggestions: - suggestion_summary: | Use a more descriptive variable name here relevant_file: "src/file1.py" relevant_lines_start: 13 relevant_lines_end: 14 suggestion_score: 6 why: | The variable name 't' is not descriptive enough - ... ``` Each YAML output MUST be after a newline, indented, with block scalar indicator ('|'). """ user="""You are given a Pull Request (PR) code diff: ====== {{ diff|trim }} ====== Below are {{ num_code_suggestions }} AI-generated code suggestions for the Pull Request: ====== {{ suggestion_str|trim }} ====== {%- if duplicate_prompt_examples %} Example output: ```yaml code_suggestions: - suggestion_summary: | ... relevant_file: "..." relevant_lines_start: ... relevant_lines_end: ... suggestion_score: ... why: | ... - ... ``` (replace '...' with actual content) {%- endif %} Response (should be a valid YAML, and nothing else): ```yaml """ ================================================ FILE: pr_agent/settings/configuration.toml ================================================ # Important: This file contains all available configuration options. # Do not copy this entire file to your repository configuration. # Your repository configuration should only include options you wish to override from the defaults. 
[config] # models model="gpt-5.4-2026-03-05" fallback_models=["o4-mini"] #model_reasoning="o4-mini" # dedicated reasoning model for self-reflection #model_weak="gpt-4o" # optional, a weaker model to use for some easier tasks # CLI git_provider="github" publish_output=true publish_output_progress=true verbosity_level=0 # 0,1,2 use_extra_bad_extensions=false # Log log_level="DEBUG" # Configurations use_wiki_settings_file=true use_repo_settings_file=true use_global_settings_file=true disable_auto_feedback = false ai_timeout=120 # 2 minutes skip_keys = [] custom_reasoning_model = false # when true, disables system messages and temperature controls for models that don't support chat-style inputs response_language="en-US" # Language locale code for PR responses in ISO 3166 and ISO 639 format (e.g., "en-US", "it-IT", "zh-CN", ...) # token limits max_description_tokens = 500 max_commits_tokens = 500 max_model_tokens = 32000 # Limits the maximum number of tokens that can be used by any model, regardless of the model's default capabilities. custom_model_max_tokens=-1 # for models not in the default list model_token_count_estimate_factor=0.3 # factor to increase the token count estimate, in order to reduce likelihood of model failure due to too many tokens - applicable only when requesting an accurate estimate.
# patch extension logic patch_extension_skip_types =[".md",".txt"] allow_dynamic_context=true max_extra_lines_before_dynamic_context = 10 # will try to include up to 10 extra lines before the hunk in the patch, until we reach an enclosing function or class patch_extra_lines_before = 5 # Number of extra lines (+3 default ones) to include before each hunk in the patch patch_extra_lines_after = 1 # Number of extra lines (+3 default ones) to include after each hunk in the patch secret_provider="" # "" (disabled), "google_cloud_storage", or "aws_secrets_manager" for secure secret management cli_mode=false output_relevant_configurations=false large_patch_policy = "clip" # "clip", "skip" duplicate_prompt_examples = false # seed seed=-1 # set positive value to fix the seed (and ensure temperature=0) temperature=0.2 # ignore logic ignore_pr_title = ["^\\[Auto\\]", "^Auto"] # a list of regular expressions to match against the PR title to ignore the PR agent ignore_pr_target_branches = [] # a list of regular expressions of target branches to ignore from PR agent when a PR is created ignore_pr_source_branches = [] # a list of regular expressions of source branches to ignore from PR agent when a PR is created ignore_pr_labels = [] # labels to ignore from PR agent when a PR is created ignore_pr_authors = [] # authors to ignore from PR agent when a PR is created ignore_repositories = [] # a list of regular expressions of repository full names (e.g. "org/repo") to ignore from PR agent processing ignore_language_framework = [] # a list of code-generation languages or frameworks (e.g.
'protobuf', 'go_gen') whose auto-generated source files will be excluded from analysis # is_auto_command = false # will be auto-set to true if the command is triggered by an automation enable_ai_metadata = false # will enable adding ai metadata reasoning_effort = "medium" # "low", "medium", "high" # extended thinking for Claude reasoning models enable_claude_extended_thinking = false # Set to true to enable extended thinking feature extended_thinking_budget_tokens = 2048 extended_thinking_max_output_tokens = 4096 # Extract issue number from PR source branch name (e.g. feature/1-auth-google -> issue #1). When true, branch-derived # issue URLs are merged with tickets from the PR description for compliance. Set to false to restore description-only behaviour. # Note: Branch-name extraction is GitHub-only for now; other providers planned for later. extract_issue_from_branch = true # Optional: custom regex with exactly one capturing group for the issue number (validated at runtime; falls back # to default if missing). If empty, uses default pattern: first 1-6 digits at start of branch or after a slash, # followed by hyphen or end (e.g. feature/1-test, 123-fix). GitHub only; other providers planned for later. branch_issue_regex = "" [pr_reviewer] # /review # # enable/disable features require_score_review=false require_tests_review=true require_estimate_effort_to_review=true require_can_be_split_review=false require_security_review=true require_estimate_contribution_time_cost=false require_todo_scan=false require_ticket_analysis_review=true # general options publish_output_no_suggestions=true # Set to "false" if you only need the reviewer's remarks (not labels, not "security audit", etc.) and want to avoid noisy "No major issues detected" comments. 
persistent_comment=true extra_instructions = "" num_max_findings = 3 final_update_message = true # review labels enable_review_labels_security=true enable_review_labels_effort=true # specific configurations for incremental review (/review -i) require_all_thresholds_for_incremental_review=false minimal_commits_for_incremental_review=0 minimal_minutes_for_incremental_review=0 enable_intro_text=true enable_help_text=false # Determines whether to include help text in the PR review. Disabled by default. [pr_description] # /describe # publish_labels=false add_original_user_description=true generate_ai_title=false use_bullet_points=true extra_instructions = "" enable_pr_type=true final_update_message = true enable_help_text=false enable_help_comment=false enable_pr_diagram=true # adds a section with a diagram of the PR changes # describe as comment publish_description_as_comment=false publish_description_as_comment_persistent=true ## changes walkthrough section enable_semantic_files_types=true collapsible_file_list='adaptive' # true, false, 'adaptive' collapsible_file_list_threshold=6 inline_file_summary=false # false, true, 'table' # markers use_description_markers=false enable_large_pr_handling=true include_generated_by_header=true #custom_labels = ['Bug fix', 'Tests', 'Bug fix with tests', 'Enhancement', 'Documentation', 'Other'] max_ai_calls=4 async_ai_calls=true [pr_questions] # /ask # enable_help_text=false use_conversation_history=true [pr_code_suggestions] # /improve # commitable_code_suggestions = false dual_publishing_score_threshold=-1 # -1 to disable, [0-10] to set the threshold (>=) for publishing a code suggestion both in a table and as commitable focus_only_on_problems=true # extra_instructions = "" enable_help_text=false enable_chat_text=false persistent_comment=true max_history_len=4 publish_output_no_suggestions=true # suggestions scoring suggestions_score_threshold=0 # [0-10]| recommend not to set this value above 8, since above it may clip highly
relevant suggestions new_score_mechanism=true new_score_mechanism_th_high=9 new_score_mechanism_th_medium=7 # params for '/improve --extended' mode auto_extended_mode=true num_code_suggestions_per_chunk=3 max_number_of_calls = 3 parallel_calls = true final_clip_factor = 0.8 decouple_hunks = false # self-review checkbox demand_code_suggestions_self_review=false # add a checkbox for the author to self-review the code suggestions code_suggestions_self_review_text= "**Author self-review**: I have reviewed the PR code suggestions, and addressed the relevant ones." approve_pr_on_self_review=false # if true, the PR will be auto-approved after the author clicks on the self-review checkbox fold_suggestions_on_self_review=true # if true, the code suggestions will be folded after the author clicks on the self-review checkbox [pr_custom_prompt] # /custom_prompt # prompt = """\ The code suggestions should focus only on the following: - ... - ... ... """ suggestions_score_threshold=0 num_code_suggestions_per_chunk=3 self_reflect_on_custom_suggestions=true enable_help_text=false [pr_add_docs] # /add_docs # extra_instructions = "" docs_style = "Sphinx" # "Google Style with Args, Returns, Attributes...etc", "Numpy Style", "Sphinx Style", "PEP257", "reStructuredText" file = "" # in case there are several components with the same name, you can specify the relevant file class_name = "" # in case there are several methods with the same name in the same file, you can specify the relevant class name [pr_update_changelog] # /update_changelog # push_changelog_changes=false extra_instructions = "" add_pr_link=true skip_ci_on_push=true [pr_analyze] # /analyze # enable_help_text=true [pr_test] # /test # extra_instructions = "" testing_framework = "" # specify the testing framework you want to use num_tests=3 # number of tests to generate. max 5. 
avoid_mocks=true # if true, the generated tests will prefer to use real objects instead of mocks file = "" # in case there are several components with the same name, you can specify the relevant file class_name = "" # in case there are several methods with the same name in the same file, you can specify the relevant class name enable_help_text=false [pr_improve_component] # /improve_component # num_code_suggestions=4 extra_instructions = "" file = "" # in case there are several components with the same name, you can specify the relevant file class_name = "" # in case there are several methods with the same name in the same file, you can specify the relevant class name [pr_help] # /help # force_local_db=false num_retrieved_snippets=5 [pr_config] # /config # [pr_help_docs] repo_url = "" #If not overwritten, will use the repo from where the context came from (issue or PR) repo_default_branch = "main" docs_path = "docs" exclude_root_readme = false supported_doc_exts = [".md", ".mdx", ".rst"] enable_help_text=false [github] # The type of deployment to create. Valid values are 'app' or 'user'. 
deployment_type = "user" ratelimit_retries = 5 base_url = "https://api.github.com" publish_inline_comments_fallback_with_verification = true try_fix_invalid_inline_comments = true app_name = "pr-agent" ignore_bot_pr = true [github_action_config] # auto_review = true # set as env var in .github/workflows/pr-agent.yaml # auto_describe = true # set as env var in .github/workflows/pr-agent.yaml # auto_improve = true # set as env var in .github/workflows/pr-agent.yaml # pr_actions = ['opened', 'reopened', 'ready_for_review', 'review_requested'] [github_app] # these toggles allow running the github app from custom deployments bot_user = "github-actions[bot]" override_deployment_type = true # settings for "pull_request" event handle_pr_actions = ['opened', 'reopened', 'ready_for_review'] pr_commands = [ "/describe --pr_description.final_update_message=false", "/review", "/improve", ] # settings for "pull_request" event with "synchronize" action - used to detect and handle push triggers for new commits handle_push_trigger = false push_trigger_ignore_bot_commits = true push_trigger_ignore_merge_commits = true push_trigger_wait_for_initial_review = true push_trigger_pending_tasks_backlog = true push_trigger_pending_tasks_ttl = 300 push_commands = [ "/describe", "/review", ] [gitlab] url = "https://gitlab.com" expand_submodule_diffs = false pr_commands = [ "/describe --pr_description.final_update_message=false", "/review", "/improve", ] handle_push_trigger = false push_commands = [ "/describe", "/review", ] # Configure SSL validation for GitLab. Can be either set to the path of a custom CA or disabled entirely.
# ssl_verify = true [gitea] url = "https://gitea.com" handle_push_trigger = false pr_commands = [ "/describe", "/review", "/improve", ] push_commands = [ "/describe", "/review", ] [bitbucket_app] pr_commands = [ "/describe --pr_description.final_update_message=false", "/review", "/improve --pr_code_suggestions.commitable_code_suggestions=true", ] avoid_full_files = false [local] # LocalGitProvider settings - uncomment to use paths other than default # description_path= "path/to/description.md" # review_path= "path/to/review.md" [gerrit] # endpoint to the gerrit service # url = "ssh://gerrit.example.com:29418" # user for gerrit authentication # user = "ai-reviewer" # patch server where patches will be saved # patch_server_endpoint = "http://127.0.0.1:5000/patch" # token to authenticate in the patch server # patch_server_token = "" [bitbucket_server] # URL to the BitBucket Server instance # url = "https://git.bitbucket.com" url = "" pr_commands = [ "/describe --pr_description.final_update_message=false", "/review", "/improve --pr_code_suggestions.commitable_code_suggestions=true", ] [litellm] # use_client = false # drop_params = false enable_callbacks = false success_callback = [] failure_callback = [] service_callback = [] # model_id = "" # Optional: Custom inference profile ID for Amazon Bedrock [pr_similar_issue] skip_comments = false force_update_dataset = false max_issues_to_scan = 500 vectordb = "pinecone" # options: "pinecone", "lancedb", "qdrant" [pr_find_similar_component] class_name = "" file = "" search_from_org = false allow_fallback_less_words = true number_of_keywords = 5 number_of_results = 5 [pinecone] # fill and place in .secrets.toml #api_key = ... # environment = "gcp-starter" [lancedb] uri = "./lancedb" [qdrant] # fill and place credentials in .secrets.toml # url = "https://YOUR-QDRANT-URL" # api_key = "..." 
[best_practices] content = "" organization_name = "" max_lines_allowed = 800 enable_global_best_practices = false [auto_best_practices] enable_auto_best_practices = true # public - general flag to disable all auto best practices usage utilize_auto_best_practices = true # public - disable usage of auto best practices in the 'improve' tool extra_instructions = "" # public - extra instructions to the auto best practices generation prompt content = "" max_patterns = 5 # max number of patterns to be detected [azure_devops] default_comment_status = "closed" [azure_devops_server] pr_commands = [ "/describe", "/review", "/improve", ] ================================================ FILE: pr_agent/settings/custom_labels.toml ================================================ [config] enable_custom_labels=false ## template for custom labels #[custom_labels."Bug fix"] #description = """Fixes a bug in the code""" #[custom_labels."Tests"] #description = """Adds or modifies tests""" #[custom_labels."Bug fix with tests"] #description = """Fixes a bug in the code and adds or modifies tests""" #[custom_labels."Enhancement"] #description = """Adds new features or modifies existing ones""" #[custom_labels."Documentation"] #description = """Adds or modifies documentation""" #[custom_labels."Other"] #description = """Other changes that do not fit in any of the above categories""" ================================================ FILE: pr_agent/settings/generated_code_ignore.toml ================================================ [generated_code] # Protocol Buffers protobuf = [ "**/*.pb.go", "**/*.pb.cc", "**/*_pb2.py", "**/*.pb.swift", "**/*.pb.rb", "**/*.pb.php", "**/*.pb.h" ] # OpenAPI / Swagger stubs openapi = [ "**/__generated__/**", "**/openapi_client/**", "**/openapi_server/**" ] swagger = [ "**/swagger.json", "**/swagger.yaml" ] # GraphQL codegen graphql = [ "**/*.graphql.ts", "**/*.generated.ts", "**/*.graphql.js" ] # RPC / gRPC Generators grpc_python = ["**/*_grpc.py"] grpc_java = 
["**/*Grpc.java"] grpc_csharp = ["**/*Grpc.cs"] grpc_typescript = ["**/*_grpc.ts", "**/*_grpc.js"] # Go code generators go_gen = [ "**/*_gen.go", "**/*generated.go" ] ================================================ FILE: pr_agent/settings/ignore.toml ================================================ [ignore] glob = [ # Ignore files and directories matching these glob patterns. # See https://docs.python.org/3/library/glob.html 'vendor/**', ] regex = [ # Ignore files and directories matching these regex patterns. # See https://learnbyexample.github.io/python-regex-cheatsheet/ # for example: regex = ['.*\.toml$'] ] ================================================ FILE: pr_agent/settings/language_extensions.toml ================================================ [bad_extensions] default = [ 'app', 'bin', 'bmp', 'bz2', 'class', 'csv', 'dat', 'db', 'dll', 'dylib', 'egg', 'eot', 'exe', 'gif', 'gitignore', 'glif', 'gradle', 'gz', 'ico', 'jar', 'jpeg', 'jpg', 'lo', 'lock', 'log', 'mp3', 'mp4', 'nar', 'o', 'ogg', 'otf', 'p', 'pdf', 'png', 'pickle', 'pkl', 'pyc', 'pyd', 'pyo', 'rkt', 'so', 'ss', 'svg', 'tar', 'tgz', 'tsv', 'ttf', 'war', 'webm', 'woff', 'woff2', 'xz', 'zip', 'zst', 'snap', 'lockb' ] extra = [ 'md', 'txt' ] [language_extension_map_org] "1C Enterprise" = ["*.bsl", ] ABAP = [".abap", ] "AGS Script" = [".ash", ] AMPL = [".ampl", ] ANTLR = [".g4", ] "API Blueprint" = [".apib", ] APL = [".apl", ".dyalog", ] ASP = [".asp", ".asax", ".ascx", ".ashx", ".asmx", ".aspx", ".axd", ] ATS = [".dats", ".hats", ".sats", ] ActionScript = [".as", ] Ada = [".adb", ".ada", ".ads", ] Agda = [".agda", ] Alloy = [".als", ] ApacheConf = [".apacheconf", ".vhost", ] AppleScript = [".applescript", ".scpt", ] Arc = [".arc", ] Arduino = [".ino", ] AsciiDoc = [".asciidoc", ".adoc", ] AspectJ = [".aj", ] Assembly = [".asm", ".a51", ".nasm", ] Augeas = [".aug", ] AutoHotkey = [".ahk", ".ahkl", ] AutoIt = [".au3", ] Awk = [".awk", ".auk", ".gawk", ".mawk", ".nawk", ] Batchfile = [".bat", ".cmd", 
] Befunge = [".befunge", ] Bison = [".bison", ] BitBake = [".bb", ] BlitzBasic = [".decls", ] BlitzMax = [".bmx", ] Bluespec = [".bsv", ] Boo = [".boo", ] Brainfuck = [".bf", ] Brightscript = [".brs", ] Bro = [".bro", ] C = [".c", ".cats", ".h", ".idc", ".w", ] "C#" = [".cs", ".cake", ".cshtml", ".csx", ] "C++" = [".cpp", ".c++", ".cc", ".cp", ".cxx", ".h++", ".hh", ".hpp", ".hxx", ".inl", ".ipp", ".tcc", ".tpp", ".C", ".H", ] C-ObjDump = [".c-objdump", ] "C2hs Haskell" = [".chs", ] CLIPS = [".clp", ] CMake = [".cmake", ".cmake.in", ] COBOL = [".cob", ".cbl", ".ccp", ".cobol", ".cpy", ] CSS = [".css", ] CSV = [".csv", ] "Cap'n Proto" = [".capnp", ] CartoCSS = [".mss", ] Ceylon = [".ceylon", ] Chapel = [".chpl", ] ChucK = [".ck", ] Cirru = [".cirru", ] Clarion = [".clw", ] Clean = [".icl", ".dcl", ] Click = [".click", ] Clojure = [".clj", ".boot", ".cl2", ".cljc", ".cljs", ".cljs.hl", ".cljscm", ".cljx", ".hic", ] CoffeeScript = [".coffee", "._coffee", ".cjsx", ".cson", ".iced", ] ColdFusion = [".cfm", ".cfml", ] "ColdFusion CFC" = [".cfc", ] "Common Lisp" = [".lisp", ".asd", ".lsp", ".ny", ".podsl", ".sexp", ] "Component Pascal" = [".cps", ] Coq = [".coq", ] Cpp-ObjDump = [".cppobjdump", ".c++-objdump", ".c++objdump", ".cpp-objdump", ".cxx-objdump", ] Creole = [".creole", ] Crystal = [".cr", ] Csound = [".csd", ] Cucumber = [".feature", ] Cuda = [".cu", ".cuh", ] Cycript = [".cy", ] Cython = [".pyx", ".pxd", ".pxi", ] D = [".di", ] D-ObjDump = [".d-objdump", ] "DIGITAL Command Language" = [".com", ] DM = [".dm", ] "DNS Zone" = [".zone", ".arpa", ] "Darcs Patch" = [".darcspatch", ".dpatch", ] Dart = [".dart", ] Diff = [".diff", ".patch", ] Dockerfile = [".dockerfile", "Dockerfile", ] Dogescript = [".djs", ] Dylan = [".dylan", ".dyl", ".intr", ".lid", ] E = [".E", ] ECL = [".ecl", ".eclxml", ] Eagle = [".sch", ".brd", ] "Ecere Projects" = [".epj", ] Eiffel = [".e", ] Elixir = [".ex", ".exs", ] Elm = [".elm", ] "Emacs Lisp" = [".el", ".emacs", ".emacs.desktop", ] 
EmberScript = [".em", ".emberscript", ] Erlang = [".erl", ".escript", ".hrl", ".xrl", ".yrl", ] "F#" = [".fs", ".fsi", ".fsx", ] FLUX = [".flux", ] FORTRAN = [".f90", ".f", ".f03", ".f08", ".f77", ".f95", ".for", ".fpp", ] Factor = [".factor", ] Fancy = [".fy", ".fancypack", ] Fantom = [".fan", ] Formatted = [".eam.fs", ] Forth = [".fth", ".4th", ".forth", ".frt", ] FreeMarker = [".ftl", ] G-code = [".g", ".gco", ".gcode", ] GAMS = [".gms", ] GAP = [".gap", ".gi", ] GAS = [".s", ] GDScript = [".gd", ] GLSL = [".glsl", ".fp", ".frag", ".frg", ".fsh", ".fshader", ".geo", ".geom", ".glslv", ".gshader", ".shader", ".vert", ".vrx", ".vsh", ".vshader", ] Genshi = [".kid", ] "Gentoo Ebuild" = [".ebuild", ] "Gentoo Eclass" = [".eclass", ] "Gettext Catalog" = [".po", ".pot", ] Glyph = [".glf", ] Gnuplot = [".gp", ".gnu", ".gnuplot", ".plot", ".plt", ] Go = [".go", ] Golo = [".golo", ] Gosu = [".gst", ".gsx", ".vark", ] Grace = [".grace", ] Gradle = [".gradle", ] "Grammatical Framework" = [".gf", ] GraphQL = [".graphql", ] "Graphviz (DOT)" = [".dot", ".gv", ] Groff = [".man", ".1", ".1in", ".1m", ".1x", ".2", ".3", ".3in", ".3m", ".3qt", ".3x", ".4", ".5", ".6", ".7", ".8", ".9", ".me", ".rno", ".roff", ] Groovy = [".groovy", ".grt", ".gtpl", ".gvy", ] "Groovy Server Pages" = [".gsp", ] HCL = [".hcl", ".tf", ] HLSL = [".hlsl", ".fxh", ".hlsli", ] HTML = [".html", ".htm", ".html.hl", ".xht", ".xhtml", ] "HTML+Django" = [".mustache", ".jinja", ] "HTML+EEX" = [".eex", ] "HTML+ERB" = [".erb", ".erb.deface", ] "HTML+PHP" = [".phtml", ] HTTP = [".http", ] Haml = [".haml", ".haml.deface", ] Handlebars = [".handlebars", ".hbs", ] Harbour = [".hb", ] Haskell = [".hs", ".hsc", ] Haxe = [".hx", ".hxsl", ] Hy = [".hy", ] IDL = [".dlm", ] "IGOR Pro" = [".ipf", ] INI = [".ini", ".cfg", ".prefs", ".properties", ] "IRC log" = [".irclog", ".weechatlog", ] Idris = [".idr", ".lidr", ] "Inform 7" = [".ni", ".i7x", ] "Inno Setup" = [".iss", ] Io = [".io", ] Ioke = [".ik", ] Isabelle = [".thy", ] 
J = [".ijs", ] JFlex = [".flex", ".jflex", ] JSON = [".json", ".geojson", ".lock", ".topojson", ] JSON5 = [".json5", ] JSONLD = [".jsonld", ] JSONiq = [".jq", ] JSX = [".jsx", ] Jade = [".jade", ] Jasmin = [".j", ] Java = [".java", ] "Java Server Pages" = [".jsp", ] JavaScript = [".js", "._js", ".bones", ".es6", ".jake", ".jsb", ".jscad", ".jsfl", ".jsm", ".jss", ".njs", ".pac", ".sjs", ".ssjs", ".xsjs", ".xsjslib", ] Julia = [".jl", ] "Jupyter Notebook" = [".ipynb", ] KRL = [".krl", ] KiCad = [".kicad_pcb", ] Kit = [".kit", ] Kotlin = [".kt", ".ktm", ".kts", ] LFE = [".lfe", ] LLVM = [".ll", ] LOLCODE = [".lol", ] LSL = [".lsl", ".lslp", ] LabVIEW = [".lvproj", ] Lasso = [".lasso", ".las", ".lasso8", ".lasso9", ".ldml", ] Latte = [".latte", ] Lean = [".lean", ".hlean", ] Less = [".less", ] Lex = [".lex", ] LilyPond = [".ly", ".ily", ] "Linker Script" = [".ld", ".lds", ] Liquid = [".liquid", ] "Literate Agda" = [".lagda", ] "Literate CoffeeScript" = [".litcoffee", ] "Literate Haskell" = [".lhs", ] LiveScript = [".ls", "._ls", ] Logos = [".xm", ".x", ".xi", ] Logtalk = [".lgt", ".logtalk", ] LookML = [".lookml", ] Lua = [".lua", ".nse", ".pd_lua", ".rbxs", ".wlua", ] M = [".mumps", ] M4 = [".m4", ] MAXScript = [".mcr", ] MTML = [".mtml", ] MUF = [".muf", ] Makefile = [".mak", ".mk", ".mkfile", "Makefile", ] Mako = [".mako", ".mao", ] Maple = [".mpl", ] Markdown = [".md", ".markdown", ".mkd", ".mkdn", ".mkdown", ".ron", ] Mask = [".mask", ] Mathematica = [".mathematica", ".cdf", ".ma", ".mt", ".nb", ".nbp", ".wl", ".wlt", ] Matlab = [".matlab", ] Max = [".maxpat", ".maxhelp", ".maxproj", ".mxt", ".pat", ] MediaWiki = [".mediawiki", ".wiki", ] Metal = [".metal", ] MiniD = [".minid", ] Mirah = [".druby", ".duby", ".mir", ".mirah", ] Modelica = [".mo", ] "Module Management System" = [".mms", ".mmk", ] Monkey = [".monkey", ] MoonScript = [".moon", ] Myghty = [".myt", ] NSIS = [".nsi", ".nsh", ] NetLinx = [".axs", ".axi", ] "NetLinx+ERB" = [".axs.erb", ".axi.erb", ] 
NetLogo = [".nlogo", ] Nginx = [".nginxconf", ] Nimrod = [".nim", ".nimrod", ] Ninja = [".ninja", ] Nit = [".nit", ] Nix = [".nix", ] Nu = [".nu", ] NumPy = [".numpy", ".numpyw", ".numsc", ] OCaml = [".ml", ".eliom", ".eliomi", ".ml4", ".mli", ".mll", ".mly", ] ObjDump = [".objdump", ] "Objective-C++" = [".mm", ] Objective-J = [".sj", ] Octave = [".oct", ] Omgrofl = [".omgrofl", ] Opa = [".opa", ] Opal = [".opal", ] OpenCL = [".cl", ".opencl", ] "OpenEdge ABL" = [".p", ] OpenSCAD = [".scad", ] Org = [".org", ] Ox = [".ox", ".oxh", ".oxo", ] Oxygene = [".oxygene", ] Oz = [".oz", ] PAWN = [".pwn", ] PHP = [".php", ".aw", ".ctp", ".php3", ".php4", ".php5", ".phps", ".phpt", ] "POV-Ray SDL" = [".pov", ] Pan = [".pan", ] Papyrus = [".psc", ] Parrot = [".parrot", ] "Parrot Assembly" = [".pasm", ] "Parrot Internal Representation" = [".pir", ] Pascal = [".pas", ".dfm", ".dpr", ".lpr", ] Perl = [".pl", ".al", ".perl", ".ph", ".plx", ".pm", ".psgi", ".t", ] Perl6 = [".6pl", ".6pm", ".nqp", ".p6", ".p6l", ".p6m", ".pl6", ".pm6", ] Pickle = [".pkl", ] PigLatin = [".pig", ] Pike = [".pike", ".pmod", ] Pod = [".pod", ] PogoScript = [".pogo", ] Pony = [".pony", ] PostScript = [".ps", ".eps", ] PowerShell = [".ps1", ".psd1", ".psm1", ] Processing = [".pde", ] Prolog = [".prolog", ".yap", ] "Propeller Spin" = [".spin", ] "Protocol Buffer" = [".proto", ] "Public Key" = [".pub", ] "Pure Data" = [".pd", ] PureBasic = [".pb", ".pbi", ] PureScript = [".purs", ] Python = [".py", ".bzl", ".gyp", ".lmi", ".pyde", ".pyp", ".pyt", ".pyw", ".tac", ".wsgi", ".xpy", ] "Python traceback" = [".pytb", ] QML = [".qml", ".qbs", ] QMake = [".pri", ] R = [".r", ".rd", ".rsx", ] RAML = [".raml", ] RDoc = [".rdoc", ] REALbasic = [".rbbas", ".rbfrm", ".rbmnu", ".rbres", ".rbtbar", ".rbuistate", ] RHTML = [".rhtml", ] RMarkdown = [".rmd", ] Racket = [".rkt", ".rktd", ".rktl", ".scrbl", ] "Ragel in Ruby Host" = [".rl", ] "Raw token data" = [".raw", ] Rebol = [".reb", ".r2", ".r3", ".rebol", ] Red = 
[".red", ".reds", ] Redcode = [".cw", ] "Ren'Py" = [".rpy", ] RenderScript = [".rsh", ] RobotFramework = [".robot", ] Rouge = [".rg", ] Ruby = [".rb", ".builder", ".gemspec", ".god", ".irbrc", ".jbuilder", ".mspec", ".podspec", ".rabl", ".rake", ".rbuild", ".rbw", ".rbx", ".ru", ".ruby", ".thor", ".watchr", ] Rust = [".rs", ".rs.in", ] SAS = [".sas", ] SCSS = [".scss", ] SMT = [".smt2", ".smt", ] SPARQL = [".sparql", ".rq", ] SQF = [".sqf", ".hqf", ] SQL = [".pls", ".pck", ".pkb", ".pks", ".plb", ".plsql", ".sql", ".cql", ".ddl", ".prc", ".tab", ".udf", ".viw", ".db2", ] STON = [".ston", ] SVG = [".svg", ] Sage = [".sage", ".sagews", ] SaltStack = [".sls", ] Sass = [".sass", ] Scala = [".scala", ".sbt", ] Scaml = [".scaml", ] Scheme = [".scm", ".sld", ".sps", ".ss", ] Scilab = [".sci", ".sce", ] Self = [".self", ] Shell = [".sh", ".bash", ".bats", ".command", ".ksh", ".sh.in", ".tmux", ".tool", ".zsh", ] ShellSession = [".sh-session", ] Shen = [".shen", ] Slash = [".sl", ] Slim = [".slim", ] Smali = [".smali", ] Smalltalk = [".st", ] Smarty = [".tpl", ] Solidity = [".sol", ] SourcePawn = [".sp", ".sma", ] Squirrel = [".nut", ] Stan = [".stan", ] "Standard ML" = [".ML", ".fun", ".sig", ".sml", ] Stata = [".do", ".ado", ".doh", ".ihlp", ".mata", ".matah", ".sthlp", ] Stylus = [".styl", ] SuperCollider = [".scd", ] Swift = [".swift", ] SystemVerilog = [".sv", ".svh", ".vh", ] TOML = [".toml", ] TXL = [".txl", ] Tcl = [".tcl", ".adp", ".tm", ] Tcsh = [".tcsh", ".csh", ] TeX = [".tex", ".aux", ".bbx", ".bib", ".cbx", ".dtx", ".ins", ".lbx", ".ltx", ".mkii", ".mkiv", ".mkvi", ".sty", ".toc", ] Tea = [".tea", ] Text = [".txt", ".no", ] Textile = [".textile", ] Thrift = [".thrift", ] Turing = [".tu", ] Turtle = [".ttl", ] Twig = [".twig", ] TypeScript = [".ts", ".tsx", ] "Unified Parallel C" = [".upc", ] "Unity3D Asset" = [".anim", ".asset", ".mat", ".meta", ".prefab", ".unity", ] Uno = [".uno", ] UnrealScript = [".uc", ] UrWeb = [".ur", ".urs", ] VCL = [".vcl", ] VHDL = 
[".vhdl", ".vhd", ".vhf", ".vhi", ".vho", ".vhs", ".vht", ".vhw", ] Vala = [".vala", ".vapi", ] Verilog = [".veo", ] VimL = [".vim", ] "Visual Basic" = [".vb", ".bas", ".frm", ".frx", ".vba", ".vbhtml", ".vbs", ] Volt = [".volt", ] Vue = [".vue", ] "Web Ontology Language" = [".owl", ] WebAssembly = [".wat", ] WebIDL = [".webidl", ] X10 = [".x10", ] XC = [".xc", ] XML = [".xml", ".ant", ".axml", ".ccxml", ".clixml", ".cproject", ".csl", ".csproj", ".ct", ".dita", ".ditamap", ".ditaval", ".dll.config", ".dotsettings", ".filters", ".fsproj", ".fxml", ".glade", ".grxml", ".iml", ".ivy", ".jelly", ".jsproj", ".kml", ".launch", ".mdpolicy", ".mxml", ".nproj", ".nuspec", ".odd", ".osm", ".plist", ".props", ".ps1xml", ".psc1", ".pt", ".rdf", ".rss", ".scxml", ".srdf", ".storyboard", ".stTheme", ".sublime-snippet", ".targets", ".tmCommand", ".tml", ".tmLanguage", ".tmPreferences", ".tmSnippet", ".tmTheme", ".ui", ".urdf", ".ux", ".vbproj", ".vcxproj", ".vssettings", ".vxml", ".wsdl", ".wsf", ".wxi", ".wxl", ".wxs", ".x3d", ".xacro", ".xaml", ".xib", ".xlf", ".xliff", ".xmi", ".xml.dist", ".xproj", ".xsd", ".xul", ".zcml", ] XPages = [".xsp-config", ".xsp.metadata", ] XProc = [".xpl", ".xproc", ] XQuery = [".xquery", ".xq", ".xql", ".xqm", ".xqy", ] XS = [".xs", ] XSLT = [".xslt", ".xsl", ] Xojo = [".xojo_code", ".xojo_menu", ".xojo_report", ".xojo_script", ".xojo_toolbar", ".xojo_window", ] Xtend = [".xtend", ] YAML = [".yml", ".reek", ".rviz", ".sublime-syntax", ".syntax", ".yaml", ".yaml-tmlanguage", ] YANG = [".yang", ] Yacc = [".y", ".yacc", ".yy", ] Zephir = [".zep", ] Zig = [".zig", ] Zimpl = [".zimpl", ".zmpl", ".zpl", ] desktop = [".desktop", ".desktop.in", ] eC = [".ec", ".eh", ] edn = [".edn", ] fish = [".fish", ] mupad = [".mu", ] nesC = [".nc", ] ooc = [".ooc", ] reStructuredText = [".rst", ".rest", ".rest.txt", ".rst.txt", ] wisp = [".wisp", ] xBase = [".prg", ".prw", ] [docs_blacklist_extensions] # Disable docs for these extensions of text files and scripts 
that are not programming languages with functions, classes and methods
{%- if extra_instructions %} Extra instructions from the user: ====== {{ extra_instructions }} ====== {%- endif %} You must use the following YAML schema to format your answer: ```yaml Code Documentation: type: array uniqueItems: true items: relevant file: type: string description: The full file path of the relevant file. relevant line: type: integer description: |- The relevant line number from a '__new hunk__' section where the {{ docs_for_language }} should be added. doc placement: type: string enum: - before - after description: |- The {{ docs_for_language }} placement relative to the relevant line (code component). For example, in Python the docs are placed after the function signature, but in Java they are placed before. documentation: type: string description: |- The {{ docs_for_language }} content. It should be complete, correctly formatted and indented, and without line numbers. ``` Example output: ```yaml Code Documentation: - relevant file: |- src/file1.py relevant lines: 12 doc placement: after documentation: |- \"\"\" This is a python docstring for func1. \"\"\" - ... ... ``` Each YAML output MUST be after a newline, indented, with block scalar indicator ('|-'). Don't repeat the prompt in the answer, and avoid outputting the 'type' and 'description' fields. """ user="""PR Info: Title: '{{ title }}' Branch: '{{ branch }}' {%- if description %} Description: ====== {{ description|trim }} ====== {%- endif %} {%- if language %} Main PR language: '{{language}}' {%- endif %} The PR Diff: ====== {{ diff|trim }} ====== Response (should be a valid YAML, and nothing else): ```yaml """ ================================================ FILE: pr_agent/settings/pr_custom_labels.toml ================================================ [pr_custom_labels_prompt] system="""You are PR-Reviewer, a language model designed to review a Git Pull Request (PR). Your task is to provide labels that describe the PR content. 
{%- if enable_custom_labels %} Thoroughly read the labels name and the provided description, and decide whether the label is relevant to the PR. {%- endif %} {%- if extra_instructions %} Extra instructions from the user: ====== {{ extra_instructions }} ====== {% endif %} The output must be a YAML object equivalent to type $Labels, according to the following Pydantic definitions: ====== {%- if enable_custom_labels %} {{ custom_labels_class }} {%- else %} class Label(str, Enum): bug_fix = "Bug fix" tests = "Tests" enhancement = "Enhancement" documentation = "Documentation" other = "Other" {%- endif %} class Labels(BaseModel): labels: List[Label] = Field(min_items=0, description="choose the relevant custom labels that describe the PR content, and return their keys. Use the value field of the Label object to better understand the label meaning.") ====== Example output: ```yaml labels: - ... - ... ``` Answer should be a valid YAML, and nothing else. """ user="""PR Info: Previous title: '{{title}}' Branch: '{{ branch }}' {%- if description %} Description: ====== {{ description|trim }} ====== {%- endif %} {%- if language %} Main PR language: '{{ language }}' {%- endif %} {%- if commit_messages_str %} Commit messages: ====== {{ commit_messages_str|trim }} ====== {%- endif %} The PR Git Diff: ====== {{ diff|trim }} ====== Note that lines in the diff body are prefixed with a symbol that represents the type of change: '-' for deletions, '+' for additions, and ' ' (a space) for unchanged lines. Response (should be a valid YAML, and nothing else): ```yaml """ ================================================ FILE: pr_agent/settings/pr_description_prompts.toml ================================================ [pr_description_prompt] system="""You are PR-Reviewer, a language model designed to review a Git Pull Request (PR). Your task is to provide a full description for the PR content: type, description, title, and files walkthrough. 
- Focus on the new PR code (lines starting with '+' in the 'PR Git Diff' section). - Keep in mind that the 'Previous title', 'Previous description' and 'Commit messages' sections may be partial, simplistic, non-informative or out of date. Hence, compare them to the PR diff code, and use them only as a reference. - The generated title and description should prioritize the most significant changes. - If needed, each YAML output should be in block scalar indicator ('|') - When quoting variables, names or file paths from the code, use backticks (`) instead of single quote ('). - When needed, use '- ' as bullets {%- if extra_instructions %} Extra instructions from the user: ===== {{extra_instructions}} ===== {% endif %} The output must be a YAML object equivalent to type $PRDescription, according to the following Pydantic definitions: ===== class PRType(str, Enum): bug_fix = "Bug fix" tests = "Tests" enhancement = "Enhancement" documentation = "Documentation" other = "Other" {%- if enable_custom_labels %} {{ custom_labels_class }} {%- endif %} {%- if enable_semantic_files_types %} class FileDescription(BaseModel): filename: str = Field(description="The full file path of the relevant file") {%- if include_file_summary_changes %} changes_summary: str = Field(description="concise summary of the changes in the relevant file, in bullet points (1-4 bullet points).") {%- endif %} changes_title: str = Field(description="one-line summary (5-10 words) capturing the main theme of changes in the file") label: str = Field(description="a single semantic label that represents a type of code changes that occurred in the File. Possible values (partial list): 'bug fix', 'tests', 'enhancement', 'documentation', 'error handling', 'configuration changes', 'dependencies', 'formatting', 'miscellaneous', ...") {%- endif %} class PRDescription(BaseModel): type: List[PRType] = Field(description="one or more types that describe the PR content. Return the label member value (e.g. 
'Bug fix', not 'bug_fix')") description: str = Field(description="summarize the PR changes with 1-4 bullet points, each up to 8 words. For large PRs, add sub-bullets for each bullet if needed. Order bullets by importance, with each bullet highlighting a key change group.") title: str = Field(description="a concise and descriptive title that captures the PR's main theme") {%- if enable_pr_diagram %} changes_diagram: str = Field(description='a horizontal diagram that represents the main PR changes, in the format of a valid mermaid LR flowchart. The diagram should be concise and easy to read. Leave empty if no diagram is relevant. To create robust Mermaid diagrams, follow this two-step process: (1) Declare the nodes: nodeID["node description"]. (2) Then define the links: nodeID1 -- "link text" --> nodeID2. Node description must always be surrounded with double quotation marks') '{%- endif %} {%- if enable_semantic_files_types %} pr_files: List[FileDescription] = Field(max_items=20, description="a list of all the files that were changed in the PR, and summary of their changes. Each file must be analyzed regardless of change size.") {%- endif %} ===== Example output: ```yaml type: - ... - ... description: | - ... - ... title: | ... {%- if enable_pr_diagram %} changes_diagram: | ```mermaid flowchart LR ... ``` {%- endif %} {%- if enable_semantic_files_types %} pr_files: - filename: | ... {%- if include_file_summary_changes %} changes_summary: | ... {%- endif %} changes_title: | ... label: | label_key_1 ... {%- endif %} ``` Answer should be a valid YAML, and nothing else. 
Each YAML output MUST be after a newline, with proper indent, and block scalar indicator ('|') """ user=""" {%- if related_tickets %} Related Ticket Info: {% for ticket in related_tickets %} ===== Ticket Title: '{{ ticket.title }}' {%- if ticket.labels %} Ticket Labels: {{ ticket.labels }} {%- endif %} {%- if ticket.body %} Ticket Description: ##### {{ ticket.body }} ##### {%- endif %} ===== {% endfor %} {%- endif %} PR Info: Previous title: '{{title}}' {%- if description %} Previous description: ===== {{ description|trim }} ===== {%- endif %} Branch: '{{branch}}' {%- if commit_messages_str %} Commit messages: ===== {{ commit_messages_str|trim }} ===== {%- endif %} The PR Git Diff: ===== {{ diff|trim }} ===== Note that lines in the diff body are prefixed with a symbol that represents the type of change: '-' for deletions, '+' for additions, and ' ' (a space) for unchanged lines. {%- if duplicate_prompt_examples %} Example output: ```yaml type: - Bug fix - Refactoring - ... description: | - ... - ... title: | ... {%- if enable_pr_diagram %} changes_diagram: | ```mermaid flowchart LR ... ``` {%- endif %} {%- if enable_semantic_files_types %} pr_files: - filename: | ... {%- if include_file_summary_changes %} changes_summary: | ... {%- endif %} changes_title: | ... label: | label_key_1 ... {%- endif %} ``` (replace '...' with the actual values) {%- endif %} Response (should be a valid YAML, and nothing else): ```yaml """ ================================================ FILE: pr_agent/settings/pr_evaluate_prompt_response.toml ================================================ [pr_evaluate_prompt] prompt="""\ You are the PR-task-evaluator, a language model that compares and ranks the quality of two responses provided in response to a lengthy task regarding a Pull Request (PR) code diff. 
The task to be evaluated is: ***** Start of Task ***** {{pr_task|trim}} ***** End of Task ***** Response 1 to the task is: ***** Start of Response 1 ***** {{pr_response1|trim}} ***** End of Response 1 ***** Response 2 to the task is: ***** Start of Response 2 ***** {{pr_response2|trim}} ***** End of Response 2 ***** Guidelines to evaluate the responses: - Thoroughly read the 'Task' part. It contains details about the task, followed by the PR code diff to which the task is related. - Thoroughly read 'Response1' and 'Response2' parts. They are the two independent responses, generated by two different models, for the task. After that, rank each response. Criterions to rank each response: - How well does the response follow the specific task instructions and requirements? - How well does the response analyze and understand the PR code diff? - How well will a person perceive it as a good response that correctly addresses the task? - How well does the response prioritize key feedback, related to the task instructions, that a human reader seeing that feedback would also consider as important? - Don't necessarily rank higher a response that is longer. A shorter response might be better if it is more concise, and still addresses the task better. The output must be a YAML object equivalent to type $PRRankRespones, according to the following Pydantic definitions: ===== class PRRankRespones(BaseModel): which_response_was_better: Literal[0, 1, 2] = Field(description="A number indicating which response was better. 0 means both responses are equally good.") why: str = Field(description="In a short and concise manner, explain why the chosen response is better than the other. 
Be specific and give examples if relevant.") score_response1: int = Field(description="A score between 1 and 10, indicating the quality of the response1, based on the criterions mentioned in the prompt.") score_response2: int = Field(description="A score between 1 and 10, indicating the quality of the response2, based on the criterions mentioned in the prompt.") ===== Example output: ```yaml which_response_was_better: "X" why: "Response X is better because it is more practical, and addresses the task requirements better since ..." score_response1: ... score_response2: ... ``` Response (should be a valid YAML, and nothing else): ```yaml """ ================================================ FILE: pr_agent/settings/pr_help_docs_headings_prompts.toml ================================================ [pr_help_docs_headings_prompts] system="""You are Doc-helper, a language model that ranks documentation files based on their relevance to user questions. You will receive a question, a repository url and file names along with optional groups of headings extracted from such files from that repository (either as markdown or as restructred text). Your task is to rank file paths based on how likely they contain the answer to a user's question, using only the headings from each such file and the file name. ====== ==file name== 'src/file1.py' ==index== 0 based integer ==file headings== heading #1 heading #2 ... ==file name== 'src/file2.py' ==index== 0 based integer ==file headings== heading #1 heading #2 ... ... 
====== Additional instructions: - Consider only the file names and section headings within each document - Present the most relevant files first, based strictly on how well their headings and file names align with user question The output must be a YAML object equivalent to type $DocHeadingsHelper, according to the following Pydantic definitions: ===== class file_idx_and_path(BaseModel): idx: int = Field(description="The zero based index of file_name, as it appeared in the original list of headings. Cannot be negative.") file_name: str = Field(description="The file_name exactly as it appeared in the question") class DocHeadingsHelper(BaseModel): user_question: str = Field(description="The user's question") relevant_files_ranking: List[file_idx_and_path] = Field(description="Files sorted in descending order by relevance to question") ===== Example output: ```yaml user_question: | ... relevant_files_ranking: - idx: 101 file_name: "src/file1.py" - ... """ user="""\ Documentation url: '{{ docs_url|trim }}' ----- User's Question: ===== {{ question|trim }} ===== Filenames with optional headings from documentation website content: ===== {{ snippets|trim }} ===== Reminder: The output must be a YAML object equivalent to type $DocHeadingsHelper, similar to the following example output: ===== Example output: ```yaml user_question: | ... relevant_files_ranking: - idx: 101 file_name: "src/file1.py" - ... ===== Important Notes: 1. Output most relevant file names first, by descending order of relevancy. 2. Only include files with non-negative indices Response (should be a valid YAML, and nothing else). ```yaml """ ================================================ FILE: pr_agent/settings/pr_help_docs_prompts.toml ================================================ [pr_help_docs_prompts] system="""You are Doc-helper, a language model designed to answer questions about a documentation website for a given repository. 
You will receive a question, a repository url and the full documentation content for that repository (either as markdown or as restructured text). Your goal is to provide the best answer to the question using the documentation provided. Additional instructions: - Be short and concise in your answers. Give examples if needed. - Answer only questions that are related to the documentation website content. If the question is completely unrelated to the documentation, return an empty response. The output must be a YAML object equivalent to type $DocHelper, according to the following Pydantic definitions: ===== class relevant_section(BaseModel): file_name: str = Field(description="The name of the relevant file") relevant_section_header_string: str = Field(description="The exact text of the relevant markdown/restructured text section heading from the relevant file (starting with '#', '##', etc.). Return empty string if the entire file is the relevant section, or if the relevant section has no heading") class DocHelper(BaseModel): user_question: str = Field(description="The user's question") response: str = Field(description="The response to the user's question") relevant_sections: List[relevant_section] = Field(description="A list of the relevant markdown/restructured text sections in the documentation that answer the user's question, ordered by importance (most relevant first)") question_is_relevant: int = Field(description="Return 1 if the question is somewhat relevant to documentation. 0 - otherwise") ===== Example output: ```yaml user_question: | ... response: | ... relevant_sections: - file_name: "src/file1.py" relevant_section_header_string: | ... - ... 
question_is_relevant: | 1 """ user="""\ Documentation url: '{{ docs_url| trim }}' ----- User's Question: ===== {{ question|trim }} ===== Documentation website content: ===== {{ snippets|trim }} ===== Reminder: The output must be a YAML object equivalent to type $DocHelper, similar to the following example output: ===== Example output: ```yaml user_question: | ... response: | ... relevant_sections: - file_name: "src/file1.py" relevant_section_header_string: | ... - ... question_is_relevant: | 1 ===== Response (should be a valid YAML, and nothing else). ```yaml """ ================================================ FILE: pr_agent/settings/pr_help_prompts.toml ================================================ [pr_help_prompts] system="""You are Doc-helper, a language model designed to answer questions about a documentation website for an open-source project called "PR-Agent" (recently renamed to "Qodo Merge"). You will receive a question, and the full documentation website content. Your goal is to provide the best answer to the question using the documentation provided. Additional instructions: - Try to be short and concise in your answers. Try to give examples if needed. - The main tools of PR-Agent are 'describe', 'review', 'improve'. If there is ambiguity to which tool the user is referring to, prioritize snippets of these tools over others. - If the question has ambiguity and can relate to different tools or platforms, provide the best answer possible based on what is available, but also state in your answer what additional information would be needed to give a more accurate answer. The output must be a YAML object equivalent to type $DocHelper, according to the following Pydantic definitions: ===== class relevant_section(BaseModel): file_name: str = Field(description="The name of the relevant file") relevant_section_header_string: str = Field(description="The exact text of the relevant markdown section heading from the relevant file (starting with '#', '##', etc.). 
Return empty string if the entire file is the relevant section, or if the relevant section has no heading") class DocHelper(BaseModel): user_question: str = Field(description="The user's question") response: str = Field(description="The response to the user's question") relevant_sections: List[relevant_section] = Field(description="A list of the relevant markdown sections in the documentation that answer the user's question, ordered by importance (most relevant first)") ===== Example output: ```yaml user_question: | ... response: | ... relevant_sections: - file_name: "src/file1.py" relevant_section_header_string: | ... - ... """ user="""\ User's Question: ===== {{ question|trim }} ===== Documentation website content: ===== {{ snippets|trim }} ===== Response (should be a valid YAML, and nothing else): ```yaml """ ================================================ FILE: pr_agent/settings/pr_information_from_user_prompts.toml ================================================ [pr_information_from_user_prompt] system="""You are PR-Reviewer, a language model designed to review a Git Pull Request (PR). Given the PR Info and the PR Git Diff, generate 3 short questions about the PR code for the PR author. The goal of the questions is to help the language model understand the PR better, so the questions should be insightful, informative, non-trivial, and relevant to the PR. You should prefer asking yes/no questions, or multiple choice questions. Also add at least one open-ended question, but make sure they are not too difficult, and can be answered in a sentence or two. Example output: ' Questions to better understand the PR: 1) ... 2) ... ... 
' """ user="""PR Info: Title: '{{title}}' Branch: '{{branch}}' {%- if description %} Description: ====== {{ description|trim }} ====== {%- endif %} {%- if language %} Main PR language: '{{ language }}' {%- endif %} {%- if commit_messages_str %} Commit messages: ====== {{ commit_messages_str|trim }} ====== {%- endif %} The PR Git Diff: ====== {{ diff|trim }} ====== Note that lines in the diff body are prefixed with a symbol that represents the type of change: '-' for deletions, '+' for additions, and ' ' (a space) for unchanged lines Response: """ ================================================ FILE: pr_agent/settings/pr_line_questions_prompts.toml ================================================ [pr_line_questions_prompt] system="""You are PR-Reviewer, a language model designed to answer questions about a Git Pull Request (PR). Your goal is to answer questions\\tasks about specific lines of code in the PR, and provide feedback. Be informative, constructive, and give examples. Try to be as specific as possible. Don't avoid answering the questions. You must answer the questions, as best as you can, without adding any unrelated content. Additional guidelines: - When quoting variables or names from the code, use backticks (`) instead of single quote ('). - If relevant, use bullet points. - Be short and to the point. 
Example Hunk Structure: ====== ## File: 'src/file1.py' @@ -12,5 +12,5 @@ def func1(): code line 1 that remained unchanged in the PR code line 2 that remained unchanged in the PR -code line that was removed in the PR +code line added in the PR code line 3 that remained unchanged in the PR ====== """ user="""PR Info: Title: '{{title}}' Branch: '{{branch}}' Here is a context hunk from the PR diff: ====== {{ full_hunk|trim }} ====== Now focus on the selected lines from the hunk: ====== {{ selected_lines|trim }} ====== Note that lines in the diff body are prefixed with a symbol that represents the type of change: '-' for deletions, '+' for additions, and ' ' (a space) for unchanged lines {%- if conversation_history %} Previous discussion on this code: ====== {{ conversation_history|trim }} ====== Consider this conversation history (format: "N. Username: Message", where numbers indicate the comment order). When responding: - Maintain consistency with previous technical explanations - Address unresolved issues from earlier discussions - Build upon existing knowledge without contradictions - Incorporate relevant context while focusing on the current question {%- endif %} A question about the selected lines: ====== {{ question|trim }} ====== Response to the question: """ ================================================ FILE: pr_agent/settings/pr_questions_prompts.toml ================================================ [pr_questions_prompt] system="""You are PR-Reviewer, a language model designed to answer questions about a Git Pull Request (PR). Your goal is to answer questions\\tasks about the new code introduced in the PR (lines starting with '+' in the 'PR Git Diff' section), and provide feedback. Be informative, constructive, and give examples. Try to be as specific as possible. Don't avoid answering the questions. You must answer the questions, as best as you can, without adding any unrelated content. 
""" user="""PR Info: Title: '{{title}}' Branch: '{{branch}}' {%- if description %} Description: ====== {{ description|trim }} ====== {%- endif %} {%- if language %} Main PR language: '{{ language }}' {%- endif %} The PR Git Diff: ====== {{ diff|trim }} ====== Note that lines in the diff body are prefixed with a symbol that represents the type of change: '-' for deletions, '+' for additions, and ' ' (a space) for unchanged lines The PR Questions: ====== {{ questions|trim }} ====== Response to the PR Questions: """ ================================================ FILE: pr_agent/settings/pr_reviewer_prompts.toml ================================================ [pr_review_prompt] system="""You are PR-Reviewer, a language model designed to review a Git Pull Request (PR). Your task is to provide constructive and concise feedback for the PR. The review should focus on new code added in the PR code diff (lines starting with '+') The format we will use to present the PR code diff: ====== ## File: 'src/file1.py' {%- if is_ai_metadata %} ### AI-generated changes summary: * ... * ... {%- endif %} @@ ... @@ def func1(): __new hunk__ 11 unchanged code line0 12 unchanged code line1 13 +new code line2 added 14 unchanged code line3 __old hunk__ unchanged code line0 unchanged code line1 -old code line2 removed unchanged code line3 @@ ... @@ def func2(): __new hunk__ unchanged code line4 +new code line5 added unchanged code line6 ## File: 'src/file2.py' ... ====== - In the format above, the diff is organized into separate '__new hunk__' and '__old hunk__' sections for each code chunk. '__new hunk__' contains the updated code, while '__old hunk__' shows the removed code. If no code was removed in a specific chunk, the __old hunk__ section will be omitted. - We also added line numbers for the '__new hunk__' code, to help you refer to the code lines in your suggestions. These line numbers are not part of the actual code, and should only be used for reference. 
- Code lines are prefixed with symbols ('+', '-', ' '). The '+' symbol indicates new code added in the PR, the '-' symbol indicates code removed in the PR, and the ' ' symbol indicates unchanged code. \ The review should address new code added in the PR code diff (lines starting with '+'). {%- if is_ai_metadata %} - If available, an AI-generated summary will appear and provide a high-level overview of the file changes. Note that this summary may not be fully accurate or complete. {%- endif %} - When quoting variables, names or file paths from the code, use backticks (`) instead of single quote ('). - Note that you only see changed code segments (diff hunks in a PR), not the entire codebase. Avoid suggestions that might duplicate existing functionality or questioning code elements (like variables declarations or import statements) that may be defined elsewhere in the codebase. - Also note that if the code ends at an opening brace or statement that begins a new scope (like 'if', 'for', 'try'), don't treat it as incomplete. Instead, acknowledge the visible scope boundary and analyze only the code shown. {%- if extra_instructions %} Extra instructions from the user: ====== {{ extra_instructions }} ====== {% endif %} The output must be a YAML object equivalent to type $PRReview, according to the following Pydantic definitions: ===== {%- if require_can_be_split_review %} class SubPR(BaseModel): relevant_files: List[str] = Field(description="The relevant files of the sub-PR") title: str = Field(description="Short and concise title for an independent and meaningful sub-PR, composed only from the relevant files") {%- endif %} class KeyIssuesComponentLink(BaseModel): relevant_file: str = Field(description="The full file path of the relevant file") issue_header: str = Field(description="One or two word title for the issue. 
For example: 'Possible Bug', etc.") issue_content: str = Field(description="A short and concise summary of what should be further inspected and validated during the PR review process for this issue. Do not mention line numbers in this field.") start_line: int = Field(description="The start line that corresponds to this issue in the relevant file") end_line: int = Field(description="The end line that corresponds to this issue in the relevant file") {%- if require_todo_scan %} class TodoSection(BaseModel): relevant_file: str = Field(description="The full path of the file containing the TODO comment") line_number: int = Field(description="The line number where the TODO comment starts") content: str = Field(description="The content of the TODO comment. Only include actual TODO comments within code comments (e.g., comments starting with '#', '//', '/*', '<!--', ...). Remove leading 'TODO' prefixes. If more than 10 words, summarize the TODO comment to a single short sentence up to 10 words.") {%- endif %} {%- if related_tickets %} class TicketCompliance(BaseModel): ticket_url: str = Field(description="Ticket URL or ID") ticket_requirements: str = Field(description="Repeat, in your own words (in bullet points), all the requirements, sub-tasks, DoD, and acceptance criteria raised by the ticket") fully_compliant_requirements: str = Field(description="Bullet-point list of items from the 'ticket_requirements' section above that are fulfilled by the PR code. Don't explain how the requirements are met, just list them shortly. Can be empty") not_compliant_requirements: str = Field(description="Bullet-point list of items from the 'ticket_requirements' section above that are not fulfilled by the PR code. Don't explain how the requirements are not met, just list them shortly. 
Can be empty") requires_further_human_verification: str = Field(description="Bullet-point list of items from the 'ticket_requirements' section above that cannot be assessed through code review alone, are unclear, or need further human review (e.g., browser testing, UI checks). Leave empty if all 'ticket_requirements' were marked as fully compliant or not compliant") {%- endif %} {%- if require_estimate_contribution_time_cost %} class ContributionTimeCostEstimate(BaseModel): best_case: str = Field(description="An expert in the relevant technology stack, with no unforeseen issues or bugs during the work.", examples=["45m", "5h", "30h"]) average_case: str = Field(description="A senior developer with only brief familiarity with this specific technology stack, and no major unforeseen issues.", examples=["45m", "5h", "30h"]) worst_case: str = Field(description="A senior developer with no prior experience in this specific technology stack, requiring significant time for research, debugging, or resolving unexpected errors.", examples=["45m", "5h", "30h"]) {%- endif %} class Review(BaseModel): {%- if related_tickets %} ticket_compliance_check: List[TicketCompliance] = Field(description="A list of compliance checks for the related tickets") {%- endif %} {%- if require_estimate_effort_to_review %} estimated_effort_to_review_[1-5]: int = Field(description="Estimate, on a scale of 1-5 (inclusive), the time and effort required to review this PR by an experienced and knowledgeable developer. 1 means short and easy review , 5 means long and hard review. 
Take into account the size, complexity, quality, and the needed changes of the PR code diff.") {%- endif %} {%- if require_estimate_contribution_time_cost %} contribution_time_cost_estimate: ContributionTimeCostEstimate = Field(description="An estimate of the time required to implement the changes, based on the quantity, quality, and complexity of the contribution, as well as the context from the PR description and commit messages.") {%- endif %} {%- if require_score %} score: str = Field(description="Rate this PR on a scale of 0-100 (inclusive), where 0 means the worst possible PR code, and 100 means PR code of the highest quality, without any bugs or performance issues, that is ready to be merged immediately and run in production at scale.") {%- endif %} {%- if require_tests %} relevant_tests: str = Field(description="yes/no question: does this PR have relevant tests added or updated ?") {%- endif %} {%- if question_str %} insights_from_user_answers: str = Field(description="shortly summarize the insights you gained from the user's answers to the questions") {%- endif %} key_issues_to_review: List[KeyIssuesComponentLink] = Field(description="A short and diverse list (0-{{ num_max_findings }} issues) of high-priority bugs, problems or performance concerns introduced in the PR code, which the PR reviewer should further focus on and validate during the review process.") {%- if require_security_review %} security_concerns: str = Field(description="Does this PR code introduce vulnerabilities such as exposure of sensitive information (e.g., API keys, secrets, passwords), or security concerns like SQL injection, XSS, CSRF, and others ? Answer 'No' (without explaining why) if there are no possible issues. If there are security concerns or issues, start your answer with a short header, such as: 'Sensitive information exposure: ...', 'SQL injection: ...', etc. Explain your answer. 
Be specific and give examples if possible") {%- endif %} {%- if require_todo_scan %} todo_sections: Union[List[TodoSection], str] = Field(description="A list of TODO comments found in the PR code. Return 'No' (as a string) if there are no TODO comments in the PR") {%- endif %} {%- if require_can_be_split_review %} can_be_split: List[SubPR] = Field(min_items=0, max_items=3, description="Can this PR, which contains {{ num_pr_files }} changed files in total, be divided into smaller sub-PRs with distinct tasks that can be reviewed and merged independently, regardless of the order ? Make sure that the sub-PRs are indeed independent, with no code dependencies between them, and that each sub-PR represent a meaningful independent task. Output an empty list if the PR code does not need to be split.") {%- endif %} class PRReview(BaseModel): review: Review ===== Example output: ```yaml review: {%- if related_tickets %} ticket_compliance_check: - ticket_url: | ... ticket_requirements: | ... fully_compliant_requirements: | ... not_compliant_requirements: | ... overall_compliance_level: | ... {%- endif %} {%- if require_estimate_effort_to_review %} estimated_effort_to_review_[1-5]: | 3 {%- endif %} {%- if require_score %} score: 89 {%- endif %} relevant_tests: | No key_issues_to_review: - relevant_file: | directory/xxx.py issue_header: | Possible Bug issue_content: | ... start_line: 12 end_line: 14 - ... security_concerns: | No {%- if require_todo_scan %} todo_sections: | No {%- endif %} {%- if require_can_be_split_review %} can_be_split: - relevant_files: - ... - ... title: ... - ... {%- endif %} {%- if require_estimate_contribution_time_cost %} contribution_time_cost_estimate: best_case: | ... average_case: | ... worst_case: | ... {%- endif %} ``` Answer should be a valid YAML, and nothing else. 
Each YAML output MUST be after a newline, with proper indent, and block scalar indicator ('|') """ user=""" {%- if related_tickets %} --PR Ticket Info-- {%- for ticket in related_tickets %} ===== Ticket URL: '{{ ticket.ticket_url }}' Ticket Title: '{{ ticket.title }}' {%- if ticket.labels %} Ticket Labels: {{ ticket.labels }} {%- endif %} {%- if ticket.body %} Ticket Description: ##### {{ ticket.body }} ##### {%- endif %} {%- if ticket.requirements is defined and ticket.requirements %} Ticket Requirements: ##### {{ ticket.requirements }} ##### {%- endif %} ===== {% endfor %} {%- endif %} --PR Info-- {%- if date %} Today's Date: {{date}} {%- endif %} Title: '{{title}}' Branch: '{{branch}}' {%- if description %} PR Description: ====== {{ description|trim }} ====== {%- endif %} {%- if question_str %} ===== Here are questions to better understand the PR. Use the answers to provide better feedback. {{ question_str|trim }} User answers: ' {{ answer_str|trim }} ' ===== {%- endif %} The PR code diff: ====== {{ diff|trim }} ====== {%- if duplicate_prompt_examples %} Example output: ```yaml review: {%- if related_tickets %} ticket_compliance_check: - ticket_url: | ... ticket_requirements: | ... fully_compliant_requirements: | ... not_compliant_requirements: | ... overall_compliance_level: | ... {%- endif %} {%- if require_estimate_effort_to_review %} estimated_effort_to_review_[1-5]: | 3 {%- endif %} {%- if require_score %} score: 89 {%- endif %} relevant_tests: | No key_issues_to_review: - relevant_file: | ... issue_header: | ... issue_content: | ... start_line: ... end_line: ... - ... security_concerns: | No {%- if require_todo_scan %} todo_sections: | No {%- endif %} {%- if require_can_be_split_review %} can_be_split: - relevant_files: - ... - ... title: ... - ... {%- endif %} {%- if require_estimate_contribution_time_cost %} contribution_time_cost_estimate: best_case: | ... average_case: | ... worst_case: | ... {%- endif %} ``` (replace '...' 
with the actual values) {%- endif %} Response (should be a valid YAML, and nothing else): ```yaml """ ================================================ FILE: pr_agent/settings/pr_update_changelog_prompts.toml ================================================ [pr_update_changelog_prompt] system="""You are a language model called PR-Changelog-Updater. Your task is to add a brief summary of this PR's changes to CHANGELOG.md file of the project: - Follow the file's existing format and style conventions like dates, section titles, etc. - Only add new changes (don't repeat existing entries) - Be general, and avoid specific details, files, etc. The output should be minimal, no more than 3-4 short lines. - Write only the new content to be added to CHANGELOG.md, without any introduction or summary. The content should appear as if it's a natural part of the existing file. {%- if pr_link %} - If relevant, convert the changelog main header into a clickable link using the PR URL '{{ pr_link }}'. Format: header [*](pr_link) {%- endif %} {%- if extra_instructions %} Extra instructions from the user: ====== {{ extra_instructions|trim }} ====== {%- endif %} """ user="""PR Info: Title: '{{title}}' Branch: '{{branch}}' {%- if description %} Description: ====== {{ description|trim }} ====== {%- endif %} {%- if language %} Main PR language: '{{ language }}' {%- endif %} {%- if commit_messages_str %} Commit messages: ====== {{ commit_messages_str|trim }} ====== {%- endif %} The PR Git Diff: ====== {{ diff|trim }} ====== Current date: ``` {{today}} ``` The current 'CHANGELOG.md' file ====== {{ changelog_file_str }} ====== Response: ```markdown """ ================================================ FILE: pr_agent/tools/__init__.py ================================================ ================================================ FILE: pr_agent/tools/pr_add_docs.py ================================================ import copy import textwrap from functools import partial from typing import Dict 
def __init__(self, pr_url: str, cli_mode=False, args: list = None,
             ai_handler: partial[BaseAiHandler,] = LiteLLMAIHandler):
    """Set up the state needed to generate documentation suggestions for a PR.

    Args:
        pr_url: URL of the pull request; passed to the git provider class
            returned by ``get_git_provider()``.
        cli_mode: Stored on the instance as ``self.cli_mode``.
        args: Accepted for interface compatibility; not used in this method.
        ai_handler: Zero-argument callable that builds the AI handler.
    """
    provider = get_git_provider()(pr_url)
    self.git_provider = provider

    main_language = get_main_pr_language(provider.get_languages(), provider.get_files())
    self.main_language = main_language

    # The handler is told the PR's main language right after construction.
    self.ai_handler = ai_handler()
    self.ai_handler.main_pr_language = main_language

    # Filled in later, once the diff has been fetched and the model has answered.
    self.patches_diff = None
    self.prediction = None
    self.cli_mode = cli_mode

    # Template variables for the prompts; "diff" is replaced before rendering.
    prompt_vars = {
        "title": provider.pr.title,
        "branch": provider.get_pr_branch(),
        "description": provider.get_pr_description(),
        "language": main_language,
        "diff": "",  # empty diff for initial calculation
        "extra_instructions": get_settings().pr_add_docs.extra_instructions,
        "commit_messages_str": provider.get_commit_messages(),
        'docs_for_language': get_docs_for_language(main_language,
                                                   get_settings().pr_add_docs.docs_style),
    }
    self.vars = prompt_vars

    system_template = get_settings().pr_add_docs_prompt.system
    user_template = get_settings().pr_add_docs_prompt.user
    self.token_handler = TokenHandler(provider.pr, prompt_vars, system_template, user_template)
def push_inline_docs(self, data):
    """Publish each documentation entry in ``data`` as an inline code suggestion.

    Args:
        data: Mapping with a 'Code Documentation' key holding a list of
            entries. Each entry is read for the keys 'relevant file',
            'relevant line', 'documentation' and 'doc placement'.

    Returns:
        The result of ``publish_comment`` when the entry list is empty,
        otherwise ``None``.
    """
    entries = data['Code Documentation']
    if not entries:
        return self.git_provider.publish_comment('No code documentation found to improve this PR.')

    suggestions = []
    for entry in entries:
        try:
            if get_settings().config.verbosity_level >= 2:
                get_logger().info(f"add_docs: {entry}")

            target_file = entry['relevant file'].strip()
            target_line = int(entry['relevant line'])  # absolute position
            doc_text = entry['documentation']
            placement = entry['doc placement'].strip()
            if not doc_text:
                continue

            snippet = self.dedent_code(target_file, target_line, doc_text, placement,
                                       add_original_line=True)
            suggestions.append({
                'body': "**Suggestion:** Proposed documentation\n```suggestion\n" + snippet + "\n```",
                'relevant_file': target_file,
                'relevant_lines_start': target_line,
                'relevant_lines_end': target_line,
            })
        except Exception:
            # Best effort: an entry that cannot be processed is skipped, and only
            # reported when verbose logging is enabled.
            if get_settings().config.verbosity_level >= 2:
                get_logger().info(f"Could not parse code docs: {entry}")

    if self.git_provider.publish_code_suggestions(suggestions):
        return

    # The batch publish reported failure; retry the suggestions one at a time.
    get_logger().info("Failed to publish code docs, trying to publish each docs separately")
    for single in suggestions:
        self.git_provider.publish_code_suggestions([single])
def get_docs_for_language(language, style):
    """Return the name of the documentation convention used for ``language``.

    The language name is matched case-insensitively. Python, Lisp and Clojure
    map to "Docstring (<style>)"; Java, JavaScript/TypeScript and C++ map to
    fixed names; any other language maps to the generic "Docs".

    Args:
        language: Name of the programming language.
        style: Docstring style, interpolated into the Docstring label.

    Returns:
        The documentation convention name as a string.
    """
    normalized = language.lower()

    # Only the docstring family depends on the configured style.
    if normalized in ('python', 'lisp', 'clojure'):
        return f"Docstring ({style})"

    fixed_labels = {
        'java': "Javadocs",
        'javascript': "JSdocs",
        'typescript': "JSdocs",
        'c++': "Doxygen",
    }
    return fixed_labels.get(normalized, "Docs")
__init__(self, pr_url: str, cli_mode=False, args: list = None, ai_handler: partial[BaseAiHandler,] = LiteLLMAIHandler): self.git_provider = get_git_provider_with_context(pr_url) self.main_language = get_main_pr_language( self.git_provider.get_languages(), self.git_provider.get_files() ) num_code_suggestions = int(get_settings().pr_code_suggestions.num_code_suggestions_per_chunk) self.ai_handler = ai_handler() self.ai_handler.main_pr_language = self.main_language self.patches_diff = None self.prediction = None self.pr_url = pr_url self.cli_mode = cli_mode self.pr_description, self.pr_description_files = ( self.git_provider.get_pr_description(split_changes_walkthrough=True)) if (self.pr_description_files and get_settings().get("config.is_auto_command", False) and get_settings().get("config.enable_ai_metadata", False)): add_ai_metadata_to_diff_files(self.git_provider, self.pr_description_files) get_logger().debug(f"AI metadata added to the this command") else: get_settings().set("config.enable_ai_metadata", False) get_logger().debug(f"AI metadata is disabled for this command") self.vars = { "title": self.git_provider.pr.title, "branch": self.git_provider.get_pr_branch(), "description": self.pr_description, "language": self.main_language, "diff": "", # empty diff for initial calculation "diff_no_line_numbers": "", # empty diff for initial calculation "num_code_suggestions": num_code_suggestions, "extra_instructions": get_settings().pr_code_suggestions.extra_instructions, "commit_messages_str": self.git_provider.get_commit_messages(), "relevant_best_practices": "", "is_ai_metadata": get_settings().get("config.enable_ai_metadata", False), "focus_only_on_problems": get_settings().get("pr_code_suggestions.focus_only_on_problems", False), "date": datetime.now().strftime('%Y-%m-%d'), 'duplicate_prompt_examples': get_settings().config.get('duplicate_prompt_examples', False), } if get_settings().pr_code_suggestions.get("decouple_hunks", True): 
self.pr_code_suggestions_prompt_system = get_settings().pr_code_suggestions_prompt.system self.pr_code_suggestions_prompt_user = get_settings().pr_code_suggestions_prompt.user else: self.pr_code_suggestions_prompt_system = get_settings().pr_code_suggestions_prompt_not_decoupled.system self.pr_code_suggestions_prompt_user = get_settings().pr_code_suggestions_prompt_not_decoupled.user self.token_handler = TokenHandler(self.git_provider.pr, self.vars, self.pr_code_suggestions_prompt_system, self.pr_code_suggestions_prompt_user) self.progress = f"## Generating PR code suggestions\n\n" self.progress += f"""\nWork in progress ...<br>\n<img src="https://codium.ai/images/pr_agent/dual_ball_loading-crop.gif" width=48>""" self.progress_response = None async def run(self): try: if not self.git_provider.get_files(): get_logger().info(f"PR has no files: {self.pr_url}, skipping code suggestions") return None get_logger().info('Generating code suggestions for PR...') relevant_configs = {'pr_code_suggestions': dict(get_settings().pr_code_suggestions), 'config': dict(get_settings().config)} get_logger().debug("Relevant configs", artifacts=relevant_configs) # publish "Preparing suggestions..." 
comments if (get_settings().config.publish_output and get_settings().config.publish_output_progress and not get_settings().config.get('is_auto_command', False)): if self.git_provider.is_supported("gfm_markdown"): self.progress_response = self.git_provider.publish_comment(self.progress) else: self.git_provider.publish_comment("Preparing suggestions...", is_temporary=True) # # call the model to get the suggestions, and self-reflect on them # if not self.is_extended: # data = await retry_with_fallback_models(self._prepare_prediction, model_type=ModelType.REGULAR) # else: data = await retry_with_fallback_models(self.prepare_prediction_main, model_type=ModelType.REGULAR) if not data: data = {"code_suggestions": []} self.data = data # Handle the case where the PR has no suggestions if (data is None or 'code_suggestions' not in data or not data['code_suggestions']): await self.publish_no_suggestions() return # publish the suggestions if get_settings().config.publish_output: # If a temporary comment was published, remove it self.git_provider.remove_initial_comment() # Publish table summarized suggestions if ((not get_settings().pr_code_suggestions.commitable_code_suggestions) and self.git_provider.is_supported("gfm_markdown")): # generate summarized suggestions pr_body = self.generate_summarized_suggestions(data) get_logger().debug(f"PR output", artifact=pr_body) # require self-review if get_settings().pr_code_suggestions.demand_code_suggestions_self_review: pr_body = await self.add_self_review_text(pr_body) # add usage guide if (get_settings().pr_code_suggestions.enable_chat_text and get_settings().config.is_auto_command and isinstance(self.git_provider, GithubProvider)): pr_body += "\n\n>💡 Need additional feedback ? 
start a [PR chat](https://chromewebstore.google.com/detail/ephlnjeghhogofkifjloamocljapahnl) \n\n" if get_settings().pr_code_suggestions.enable_help_text: pr_body += "<hr>\n\n<details> <summary><strong>💡 Tool usage guide:</strong></summary><hr> \n\n" pr_body += HelpMessage.get_improve_usage_guide() pr_body += "\n</details>\n" # Output the relevant configurations if enabled if get_settings().get('config', {}).get('output_relevant_configurations', False): pr_body += show_relevant_configurations(relevant_section='pr_code_suggestions') # publish the PR comment if get_settings().pr_code_suggestions.persistent_comment: # true by default self.publish_persistent_comment_with_history(self.git_provider, pr_body, initial_header="## PR Code Suggestions ✨", update_header=True, name="suggestions", final_update_message=False, max_previous_comments=get_settings().pr_code_suggestions.max_history_len, progress_response=self.progress_response) else: if self.progress_response: self.git_provider.edit_comment(self.progress_response, body=pr_body) else: self.git_provider.publish_comment(pr_body) # dual publishing mode if int(get_settings().pr_code_suggestions.dual_publishing_score_threshold) > 0: await self.dual_publishing(data) else: await self.push_inline_code_suggestions(data) if self.progress_response: self.git_provider.remove_comment(self.progress_response) else: get_logger().info('Code suggestions generated for PR, but not published since publish_output is False.') pr_body = self.generate_summarized_suggestions(data) get_settings().data = {"artifact": pr_body} return except Exception as e: get_logger().error(f"Failed to generate code suggestions for PR, error: {e}", artifact={"traceback": traceback.format_exc()}) if get_settings().config.publish_output: if self.progress_response: self.git_provider.remove_comment(self.progress_response) else: try: self.git_provider.remove_initial_comment() self.git_provider.publish_comment(f"Failed to generate code suggestions for PR") except 
async def add_self_review_text(self, pr_body):
    """
    Append the self-review checkbox and its hidden HTML marker to the PR comment body.

    Args:
        pr_body: The comment body built so far.

    Returns:
        `pr_body` followed by an unchecked markdown checkbox carrying the configured
        self-review text, and an HTML comment naming the action tied to the checkbox.
    """
    suggestions_settings = get_settings().pr_code_suggestions
    checkbox_text = suggestions_settings.code_suggestions_self_review_text
    approve_on_review = suggestions_settings.approve_pr_on_self_review
    fold_on_review = suggestions_settings.fold_suggestions_on_self_review

    if approve_on_review and not fold_on_review:
        marker = ' <!-- approve pr self-review -->'
    elif fold_on_review and not approve_on_review:
        marker = ' <!-- fold suggestions self-review -->'
    else:
        # NOTE(review): this branch is taken when both flags are set AND when neither is set;
        # confirm that "approve and fold" is the intended marker for the neither case.
        marker = ' <!-- approve and fold suggestions self-review -->'

    pr_body += f"\n\n- [ ] {checkbox_text}"
    pr_body += marker
    return pr_body
{len(data_above_threshold['code_suggestions'])} suggestions in dual publishing mode") await self.push_inline_code_suggestions(data_above_threshold) except Exception as e: get_logger().error(f"Failed to publish dual publishing suggestions, error: {e}") @staticmethod def publish_persistent_comment_with_history(git_provider: GitProvider, pr_comment: str, initial_header: str, update_header: bool = True, name='review', final_update_message=True, max_previous_comments=4, progress_response=None, only_fold=False): def _extract_link(comment_text: str): r = re.compile(r"<!--.*?-->") match = r.search(comment_text) up_to_commit_txt = "" if match: up_to_commit_txt = f" up to commit {match.group(0)[4:-3].strip()}" return up_to_commit_txt history_header = f"#### Previous suggestions\n" last_commit_num = git_provider.get_latest_commit_url().split('/')[-1][:7] if only_fold: # A user clicked on the 'self-review' checkbox text = get_settings().pr_code_suggestions.code_suggestions_self_review_text latest_suggestion_header = f"\n\n- [x] {text}" else: latest_suggestion_header = f"Latest suggestions up to {last_commit_num}" latest_commit_html_comment = f"<!-- {last_commit_num} -->" found_comment = None if max_previous_comments > 0: try: prev_comments = list(git_provider.get_issue_comments()) for comment in prev_comments: if comment.body.startswith(initial_header): prev_suggestions = comment.body found_comment = comment comment_url = git_provider.get_comment_url(comment) if history_header.strip() not in comment.body: # no history section # extract everything between <table> and </table> in comment.body including <table> and </table> table_index = comment.body.find("<table>") if table_index == -1: git_provider.edit_comment(comment, pr_comment) continue # find http link from comment.body[:table_index] up_to_commit_txt = _extract_link(comment.body[:table_index]) prev_suggestion_table = comment.body[ table_index:comment.body.rfind("</table>") + len("</table>")] tick = "✅ " if "✅" in 
prev_suggestion_table else "" # surround with details tag prev_suggestion_table = f"<details><summary>{tick}{name.capitalize()}{up_to_commit_txt}</summary>\n<br>{prev_suggestion_table}\n\n</details>" new_suggestion_table = pr_comment.replace(initial_header, "").strip() pr_comment_updated = f"{initial_header}\n{latest_commit_html_comment}\n\n" pr_comment_updated += f"{latest_suggestion_header}\n{new_suggestion_table}\n\n___\n\n" pr_comment_updated += f"{history_header}{prev_suggestion_table}\n" else: # get the text of the previous suggestions until the latest commit sections = prev_suggestions.split(history_header.strip()) latest_table = sections[0].strip() prev_suggestion_table = sections[1].replace(history_header, "").strip() # get text after the latest_suggestion_header in comment.body table_ind = latest_table.find("<table>") up_to_commit_txt = _extract_link(latest_table[:table_ind]) latest_table = latest_table[table_ind:latest_table.rfind("</table>") + len("</table>")] # enforce max_previous_comments count = prev_suggestions.count(f"\n<details><summary>{name.capitalize()}") count += prev_suggestions.count(f"\n<details><summary>✅ {name.capitalize()}") if count >= max_previous_comments: # remove the oldest suggestion prev_suggestion_table = prev_suggestion_table[:prev_suggestion_table.rfind( f"<details><summary>{name.capitalize()} up to commit")] tick = "✅ " if "✅" in latest_table else "" # Add to the prev_suggestions section last_prev_table = f"\n<details><summary>{tick}{name.capitalize()}{up_to_commit_txt}</summary>\n<br>{latest_table}\n\n</details>" prev_suggestion_table = last_prev_table + "\n" + prev_suggestion_table new_suggestion_table = pr_comment.replace(initial_header, "").strip() pr_comment_updated = f"{initial_header}\n" pr_comment_updated += f"{latest_commit_html_comment}\n\n" pr_comment_updated += f"{latest_suggestion_header}\n\n{new_suggestion_table}\n\n" pr_comment_updated += "___\n\n" pr_comment_updated += f"{history_header}\n" pr_comment_updated 
async def _get_prediction(self, model: str, patches_diff: str, patches_diff_no_line_number: str) -> dict:
    """
    Ask the model for code suggestions on one diff chunk, then score them via self-reflection.

    Args:
        model: Name of the model used to generate the suggestions.
        patches_diff: The diff chunk exposed to the prompt templates as `diff`; it is also
            the diff passed to the self-reflection call.
        patches_diff_no_line_number: The diff chunk exposed to the prompt templates as
            `diff_no_line_numbers`.

    Returns:
        The dict produced by `_prepare_pr_code_suggestions`, whose "code_suggestions" entries
        have been scored. When self-reflection yields no response, every suggestion gets the
        default score 7 and an empty "score_why".
    """
    variables = copy.deepcopy(self.vars)
    variables["diff"] = patches_diff  # update diff
    variables["diff_no_line_numbers"] = patches_diff_no_line_number  # update diff
    environment = Environment(undefined=StrictUndefined)
    system_prompt = environment.from_string(self.pr_code_suggestions_prompt_system).render(variables)
    # Use the user prompt chosen in __init__ (decoupled vs. not decoupled) so that it always
    # pairs with the system prompt above and with the prompts the token handler was built from.
    user_prompt = environment.from_string(self.pr_code_suggestions_prompt_user).render(variables)
    response, _ = await self.ai_handler.chat_completion(
        model=model, temperature=get_settings().config.temperature, system=system_prompt, user=user_prompt)
    if not get_settings().config.publish_output:
        # When nothing is published, keep the rendered prompts on the settings object.
        get_settings().system_prompt = system_prompt
        get_settings().user_prompt = user_prompt

    # load suggestions from the AI response
    data = self._prepare_pr_code_suggestions(response)

    # self-reflect on suggestions (mandatory, since line numbers are generated now here)
    model_reflect_with_reasoning = get_model('model_reasoning')
    fallbacks = get_settings().config.fallback_models
    if (model_reflect_with_reasoning == get_settings().config.model
            and model != get_settings().config.model
            and fallbacks
            and model == fallbacks[0]):
        # we are using a fallback model (should not happen on regular conditions)
        get_logger().warning(f"Using the same model for self-reflection as the one used for suggestions")
        model_reflect_with_reasoning = model

    response_reflect = await self.self_reflect_on_suggestions(data["code_suggestions"],
                                                              patches_diff,
                                                              model=model_reflect_with_reasoning)
    if response_reflect:
        await self.analyze_self_reflection_response(data, response_reflect)
    else:
        # No reflection response: fall back to a neutral default score for every suggestion.
        for suggestion in data["code_suggestions"]:
            suggestion["score"] = 7
            suggestion["score_why"] = ""
    return data
@staticmethod
def _truncate_if_needed(suggestion):
    """
    Cut the suggestion's 'improved_code' down to the configured maximum length.

    Reads PR_CODE_SUGGESTIONS.MAX_CODE_SUGGESTION_LENGTH (default 0) and
    PR_CODE_SUGGESTIONS.SUGGESTION_TRUNCATION_MESSAGE (default "") from the settings.
    A non-positive limit disables truncation.

    Args:
        suggestion: Suggestion dict; its 'improved_code' entry is modified in place.

    Returns:
        The same `suggestion` object. When truncated, 'improved_code' holds the first
        `limit` characters followed by a newline and the truncation message.
    """
    max_code_suggestion_length = get_settings().get("PR_CODE_SUGGESTIONS.MAX_CODE_SUGGESTION_LENGTH", 0)
    suggestion_truncation_message = get_settings().get("PR_CODE_SUGGESTIONS.SUGGESTION_TRUNCATION_MESSAGE", "")

    # Guard clauses: nothing to do when truncation is disabled or the code already fits.
    if not max_code_suggestion_length > 0:
        return suggestion
    if not len(suggestion['improved_code']) > max_code_suggestion_length:
        return suggestion

    get_logger().info(f"Truncated suggestion from {len(suggestion['improved_code'])} "
                      f"characters to {max_code_suggestion_length} characters")
    kept_part = suggestion['improved_code'][:max_code_suggestion_length]
    suggestion['improved_code'] = kept_part + f"\n{suggestion_truncation_message}"
    return suggestion
async def push_inline_code_suggestions(self, data):
    """
    Publish each suggestion as an inline comment containing a ```suggestion block.

    Args:
        data: Dict with a 'code_suggestions' list of suggestion dicts.

    Returns:
        When there are no suggestions, the result of editing the progress comment (if one
        exists) or of publishing a "no suggestions" comment. Otherwise None.

    Suggestions that cannot be parsed are logged and skipped. If publishing the whole batch
    is reported as unsuccessful, each suggestion is published again on its own.
    """
    all_suggestions = data['code_suggestions']
    if not all_suggestions:
        get_logger().info('No suggestions found to improve this PR.')
        if self.progress_response:
            return self.git_provider.edit_comment(self.progress_response,
                                                  body='No suggestions found to improve this PR.')
        return self.git_provider.publish_comment('No suggestions found to improve this PR.')

    inline_comments = []
    for d in all_suggestions:
        try:
            if get_settings().config.verbosity_level >= 2:
                get_logger().info(f"suggestion: {d}")

            file_path = d['relevant_file'].strip()
            first_line = int(d['relevant_lines_start'])  # absolute position
            last_line = int(d['relevant_lines_end'])
            content = d['suggestion_content'].rstrip()
            new_code_snippet = d['improved_code'].rstrip()
            label = d['label'].strip()

            # Align the snippet's indentation with the file, but only when there is code to align.
            if new_code_snippet:
                new_code_snippet = self.dedent_code(file_path, first_line, new_code_snippet)

            # The importance is shown only when a truthy score is present.
            if d.get('score'):
                body = f"**Suggestion:** {content} [{label}, importance: {d.get('score')}]\n```suggestion\n" + new_code_snippet + "\n```"
            else:
                body = f"**Suggestion:** {content} [{label}]\n```suggestion\n" + new_code_snippet + "\n```"

            inline_comments.append({
                'body': body,
                'relevant_file': file_path,
                'relevant_lines_start': first_line,
                'relevant_lines_end': last_line,
                'original_suggestion': d,
            })
        except Exception:
            get_logger().info(f"Could not parse suggestion: {d}")

    if self.git_provider.publish_code_suggestions(inline_comments):
        return

    get_logger().info("Failed to publish code suggestions, trying to publish each suggestion separately")
    for single_comment in inline_comments:
        self.git_provider.publish_code_suggestions([single_comment])
def remove_line_numbers(self, patches_diff_list: List[str]) -> List[str]:
    """
    Return copies of the given diffs with their leading line numbers stripped.

    Per line of each diff:
      * blank / whitespace-only lines are kept as they are;
      * a line made only of numeric characters becomes an empty line;
      * a line starting with a digit loses its leading digits and the single character
        that follows them (the separator after the number);
      * any other line is kept unchanged.

    The result is also stored on `self.patches_diff_list_no_line_numbers`.

    Args:
        patches_diff_list: Diff strings to process. The list itself is not modified.

    Returns:
        A new list with one processed string per input diff. If processing fails, the
        error is logged and the input list is returned unchanged.
    """
    try:
        self.patches_diff_list_no_line_numbers = []
        # Iterate over the argument (not self.patches_diff_list) so the method works for
        # whatever list the caller passes, independent of instance state.
        for patches_diff in patches_diff_list:
            patches_diff_lines = patches_diff.splitlines()
            for i, line in enumerate(patches_diff_lines):
                if not line.strip():
                    continue
                if line.isnumeric():
                    patches_diff_lines[i] = ''
                elif line[0].isdigit():
                    # Find the first non-digit character; drop the digits before it and the
                    # character itself.
                    for j, char in enumerate(line):
                        if not char.isdigit():
                            patches_diff_lines[i] = line[j + 1:]
                            break
            self.patches_diff_list_no_line_numbers.append('\n'.join(patches_diff_lines))
        return self.patches_diff_list_no_line_numbers
    except Exception as e:
        get_logger().error(f"Error removing line numbers from patches_diff_list, error: {e}")
        return patches_diff_list
async def convert_to_decoupled_with_line_numbers(self, patches_diff_list_no_line_numbers, model) -> List[str]:
    """
    Rebuild each diff prompt as per-file sections of decoupled hunks with line numbers.

    Every prompt is split on the "## File: " marker; each file section is passed through
    `decouple_and_convert_to_hunks_with_lines_numbers` and prefixed with its header (the
    text before the first "\\n@@"). Sections are joined with two blank lines and the result
    is clipped when its token count exceeds the model's maximum minus a 2000-token reserve.

    Args:
        patches_diff_list_no_line_numbers: Diff prompts to convert.
        model: Model name used to look up the token limit.

    Returns:
        One converted prompt per input prompt, or an empty list if any step raises
        (the error is logged).
    """
    with get_logger().contextualize(sub_feature='convert_to_decoupled_with_line_numbers'):
        try:
            file_prefix = "## File: "
            delta_output = 2000
            converted_prompts = []
            for patch_prompt in patches_diff_list_no_line_numbers:
                file_sections = []
                for index, section in enumerate(patch_prompt.strip().split(f"\n{file_prefix}")):
                    header = section.split("\n@@")[0]
                    if index > 0:
                        # The split removed the marker from every section but the first, so
                        # put it back.
                        # NOTE(review): the [1:] also drops the first character of the
                        # remaining header text - confirm this is intended.
                        header = file_prefix + header[1:]
                    header = header.strip()
                    hunks = decouple_and_convert_to_hunks_with_lines_numbers(section, file=None).strip()
                    file_sections.append((header + '\n\n' + hunks).strip())
                patch_final = "\n\n\n".join(file_sections)

                # Take the model's full token limit when it is listed in MAX_TOKENS.
                max_tokens_full = MAX_TOKENS[model] if model in MAX_TOKENS else get_max_tokens(model)
                token_count = self.token_handler.count_tokens(patch_final)
                if token_count > max_tokens_full - delta_output:
                    get_logger().warning(
                        f"Token count {token_count} exceeds the limit {max_tokens_full - delta_output}. clipping the tokens")
                    patch_final = clip_tokens(patch_final, max_tokens_full - delta_output)
                converted_prompts.append(patch_final)
            return converted_prompts
        except Exception:
            get_logger().exception(f"Error converting to decoupled with line numbers",
                                   artifact={'patches_diff_list_no_line_numbers': patches_diff_list_no_line_numbers})
            return []
inside each label group by score for label, suggestions in suggestions_labels.items(): suggestions_labels[label] = sorted(suggestions, key=lambda x: x['score'], reverse=True) counter_suggestions = 0 for label, suggestions in suggestions_labels.items(): num_suggestions = len(suggestions) pr_body += f"""<tr><td rowspan={num_suggestions}>{label.capitalize()}</td>\n""" for i, suggestion in enumerate(suggestions): relevant_file = suggestion['relevant_file'].strip() relevant_lines_start = int(suggestion['relevant_lines_start']) relevant_lines_end = int(suggestion['relevant_lines_end']) range_str = "" if relevant_lines_start == relevant_lines_end: range_str = f"[{relevant_lines_start}]" else: range_str = f"[{relevant_lines_start}-{relevant_lines_end}]" try: code_snippet_link = self.git_provider.get_line_link(relevant_file, relevant_lines_start, relevant_lines_end) except: code_snippet_link = "" # add html table for each suggestion suggestion_content = suggestion['suggestion_content'].rstrip() CHAR_LIMIT_PER_LINE = 84 suggestion_content = insert_br_after_x_chars(suggestion_content, CHAR_LIMIT_PER_LINE) # pr_body += f"<tr><td><details><summary>{suggestion_content}</summary>" existing_code = suggestion['existing_code'].rstrip() + "\n" improved_code = suggestion['improved_code'].rstrip() + "\n" diff = difflib.unified_diff(existing_code.split('\n'), improved_code.split('\n'), n=999) patch_orig = "\n".join(diff) patch = "\n".join(patch_orig.splitlines()[5:]).strip('\n') example_code = "" example_code += f"```diff\n{patch.rstrip()}\n```\n" if i == 0: pr_body += f"""<td>\n\n""" else: pr_body += f"""<tr><td>\n\n""" suggestion_summary = suggestion['one_sentence_summary'].strip().rstrip('.') if "'<" in suggestion_summary and ">'" in suggestion_summary: # escape the '<' and '>' characters, otherwise they are interpreted as html tags get_logger().info(f"Escaped suggestion summary: {suggestion_summary}") suggestion_summary = suggestion_summary.replace("'<", "`<") suggestion_summary = 
def get_score_str(self, score: int) -> str:
    """
    Map a numeric suggestion score to its impact label.

    The two cut-offs are read from the `pr_code_suggestions` settings section
    (`new_score_mechanism_th_high`, default 9, and `new_score_mechanism_th_medium`,
    default 7).

    Args:
        score (int): The suggestion score.

    Returns:
        str: "High" when the score reaches the high cut-off, "Medium" when it
        reaches the medium cut-off, otherwise "Low".
    """
    high_cutoff = get_settings().pr_code_suggestions.get('new_score_mechanism_th_high', 9)
    medium_cutoff = get_settings().pr_code_suggestions.get('new_score_mechanism_th_medium', 7)

    # Checked from the strictest band downwards; the first band reached wins.
    for cutoff, impact in ((high_cutoff, "High"), (medium_cutoff, "Medium")):
        if score >= cutoff:
            return impact
    return "Low"
class PRConfig:
    """
    The PRConfig class is responsible for listing all configuration options available for the user.
    """

    def __init__(self, pr_url: str, args=None, ai_handler=None):
        """
        Initialize the PRConfig object with the git provider used to comment on a pull request.

        Args:
            pr_url (str): The URL of the pull request to comment on.
            args (list, optional): Accepted for interface parity with the other tools; not read here.
            ai_handler (optional): Accepted for interface parity with the other tools; not read here.
        """
        self.git_provider = get_git_provider()(pr_url)

    async def run(self):
        """
        Build the configuration listing and, when `config.publish_output` is set,
        publish it as a PR comment and remove the initial (temporary) comment.

        Returns:
            str: Always an empty string.
        """
        get_logger().info('Getting configuration settings...')
        get_logger().info('Preparing configs...')
        pr_comment = self._prepare_pr_configs()
        if get_settings().config.publish_output:
            get_logger().info('Pushing configs...')
            self.git_provider.publish_comment(pr_comment)
            self.git_provider.remove_initial_comment()
        return ""

    def _prepare_pr_configs(self) -> str:
        """
        Render the current `pr_*` / `config*` settings sections as a collapsible markdown block.

        Only sections whose header also appears in `configuration.toml` are listed, and
        keys that look sensitive (exact match against the skip list, or containing one of
        the partial skip words) are omitted.

        Returns:
            str: The markdown text to publish.
        """
        try:
            conf_file = get_settings().find_file("configuration.toml")
            dynconf_kwargs = {
                'core_loaders': [],  # DISABLE default loaders, otherwise will load toml files more than once.
                'loaders': ['pr_agent.custom_merge_loader'],  # Use a custom loader to merge sections, but overwrite their overlapping values. Do not use ENV variables.
                'merge_enabled': True,  # Merge multiple TOML files; prevent full section overwrite - only overlapping keys in sections overwrite prior ones.
            }
            conf_settings = Dynaconf(
                settings_files=[conf_file],
                # Security: Disable all dynamic loading features
                load_dotenv=False,  # Don't load .env files
                envvar_prefix=False,
                **dynconf_kwargs
            )
        except Exception as e:
            get_logger().error("Caught exception during Dynaconf loading. Returning empty dict",
                               artifact={"exception": e})
            conf_settings = {}

        configuration_headers = [header.lower() for header in conf_settings.keys()]
        relevant_configs = {
            header: configs for header, configs in get_settings().to_dict().items()
            if (header.lower().startswith("pr_") or header.lower().startswith("config"))
            and header.lower() in configuration_headers
        }

        skip_keys = ['ai_disclaimer', 'ai_disclaimer_title', 'ANALYTICS_FOLDER', 'secret_provider', "skip_keys",
                     "app_id", "redirect", 'trial_prefix_message', 'no_eligible_message', 'identity_provider',
                     'ALLOWED_REPOS', 'APP_NAME', 'PERSONAL_ACCESS_TOKEN', 'shared_secret', 'key',
                     'AWS_ACCESS_KEY_ID', 'AWS_SECRET_ACCESS_KEY', 'user_token', 'private_key', 'private_key_id',
                     'client_id', 'client_secret', 'token', 'bearer_token', 'jira_api_token', 'webhook_secret']
        partial_skip_keys = ['key', 'secret', 'token', 'private']

        # Dotted lookup has to go through the top-level settings object (as done elsewhere in
        # this code base, e.g. get_settings().get("config.enable_ai_metadata", False)).
        # Asking the `config` section itself for 'config.skip_keys' presumably never matches,
        # which would leave user-configured skip keys ignored.
        extra_skip_keys = get_settings().get('config.skip_keys', [])
        if isinstance(extra_skip_keys, str):
            # A single key given as a plain string must not be extended character by character.
            extra_skip_keys = [extra_skip_keys]
        if extra_skip_keys:
            skip_keys.extend(extra_skip_keys)
        skip_keys_lower = [str(key).lower() for key in skip_keys]

        markdown_text = "<details> <summary><strong>🛠️ PR-Agent Configurations:</strong></summary> \n\n"
        markdown_text += "\n\n```yaml\n\n"
        for header, configs in relevant_configs.items():
            if not configs:
                # Nothing to list for an empty section (also avoids calling .items() on a non-dict).
                continue
            markdown_text += "\n\n"
            markdown_text += f"==================== {header} ===================="
            for key, value in configs.items():
                key_lower = key.lower()
                if key_lower in skip_keys_lower:
                    continue
                if any(skip_key in key_lower for skip_key in partial_skip_keys):
                    continue
                markdown_text += f"\n{header.lower()}.{key_lower} = {repr(value) if isinstance(value, str) else value}"
                markdown_text += " "
        markdown_text += "\n```"
        markdown_text += "\n</details>\n"
        get_logger().info("Possible Configurations outputted to PR comment", artifact=markdown_text)
        return markdown_text
def __init__(self, pr_url: str, args: list = None,
             ai_handler: partial[BaseAiHandler,] = LiteLLMAIHandler):
    """
    Set up the provider, AI handler, prompt variables and token handler used to
    generate a PR description.

    Args:
        pr_url (str): The URL of the pull request.
        args (list, optional): List of arguments passed to the PRDescription class;
            not read by this constructor. Defaults to None.
        ai_handler: Zero-argument callable returning the AI handler instance.
            Defaults to LiteLLMAIHandler.
    """
    # Git provider, main PR language and PR id
    provider = get_git_provider_with_context(pr_url)
    self.git_provider = provider
    self.main_pr_language = get_main_pr_language(provider.get_languages(), provider.get_files())
    self.pr_id = provider.get_pr_id()
    self.keys_fix = ["filename:", "language:", "changes_summary:", "changes_title:", "description:", "title:"]

    # Semantic file types are switched off in the settings when the provider lacks gfm_markdown
    if get_settings().pr_description.enable_semantic_files_types:
        if not provider.is_supported("gfm_markdown"):
            get_logger().debug(f"Disabling semantic files types for {self.pr_id}, gfm_markdown not supported.")
            get_settings().pr_description.enable_semantic_files_types = False

    # AI handler
    handler = ai_handler()
    self.ai_handler = handler
    handler.main_pr_language = self.main_pr_language

    # Prompt variables
    self.COLLAPSIBLE_FILE_LIST_THRESHOLD = get_settings().pr_description.get("collapsible_file_list_threshold", 8)
    # github and gitlab support gfm_markdown
    enable_pr_diagram = (get_settings().pr_description.get("enable_pr_diagram", False)
                         and provider.is_supported("gfm_markdown"))

    prompt_vars = {}
    prompt_vars["title"] = provider.pr.title
    prompt_vars["branch"] = provider.get_pr_branch()
    prompt_vars["description"] = provider.get_pr_description(full=False)
    prompt_vars["language"] = self.main_pr_language
    prompt_vars["diff"] = ""  # empty diff for initial calculation
    prompt_vars["extra_instructions"] = get_settings().pr_description.extra_instructions
    prompt_vars["commit_messages_str"] = provider.get_commit_messages()
    prompt_vars["enable_custom_labels"] = get_settings().config.enable_custom_labels
    prompt_vars["custom_labels_class"] = ""  # will be filled if necessary in 'set_custom_labels' function
    prompt_vars["enable_semantic_files_types"] = get_settings().pr_description.enable_semantic_files_types
    prompt_vars["related_tickets"] = ""
    prompt_vars["include_file_summary_changes"] = (
        len(provider.get_diff_files()) <= self.COLLAPSIBLE_FILE_LIST_THRESHOLD
    )
    prompt_vars["duplicate_prompt_examples"] = get_settings().config.get("duplicate_prompt_examples", False)
    prompt_vars["enable_pr_diagram"] = enable_pr_diagram
    self.vars = prompt_vars

    self.user_description = provider.get_user_description()

    # Token handler built from the description prompt pair
    self.token_handler = TokenHandler(
        provider.pr,
        self.vars,
        get_settings().pr_description_prompt.system,
        get_settings().pr_description_prompt.user,
    )

    # Filled in later by the prediction pipeline
    self.patches_diff = None
    self.prediction = None
    self.file_label_dict = None
is supported if self.git_provider.is_supported("gfm_markdown") and get_settings().pr_description.enable_help_text: pr_body += "<hr>\n\n<details> <summary><strong>✨ Describe tool usage guide:</strong></summary><hr> \n\n" pr_body += HelpMessage.get_describe_usage_guide() pr_body += "\n</details>\n" elif get_settings().pr_description.enable_help_comment and self.git_provider.is_supported("gfm_markdown"): if isinstance(self.git_provider, GithubProvider): pr_body += ('\n\n___\n\n> <details> <summary> Need help?</summary><li>Type <code>/help how to ...</code> ' 'in the comments thread for any questions about PR-Agent usage.</li><li>Check out the ' '<a href="https://qodo-merge-docs.qodo.ai/usage-guide/">documentation</a> ' 'for more information.</li></details>') else: # gitlab pr_body += ("\n\n___\n\n<details><summary>Need help?</summary>- Type <code>/help how to ...</code> in the comments " "thread for any questions about PR-Agent usage.<br>- Check out the " "<a href='https://qodo-merge-docs.qodo.ai/usage-guide/'>documentation</a> for more information.</details>") # elif get_settings().pr_description.enable_help_comment: # pr_body += '\n\n___\n\n> 💡 **PR-Agent usage**: Comment `/help "your question"` on any pull request to receive relevant information' # Output the relevant configurations if enabled if get_settings().get('config', {}).get('output_relevant_configurations', False): pr_body += show_relevant_configurations(relevant_section='pr_description') if get_settings().config.publish_output: # publish labels if get_settings().pr_description.publish_labels and pr_labels and self.git_provider.is_supported("get_labels"): original_labels = self.git_provider.get_pr_labels(update=True) get_logger().debug(f"original labels", artifact=original_labels) user_labels = get_user_labels(original_labels) new_labels = pr_labels + user_labels get_logger().debug(f"published labels", artifact=new_labels) if set(new_labels) != set(original_labels): get_logger().info(f"Setting describe 
labels:\n{new_labels}") self.git_provider.publish_labels(new_labels) else: get_logger().debug(f"Labels are the same, not updating") # publish description if get_settings().pr_description.publish_description_as_comment: full_markdown_description = f"## Title\n\n{pr_title.strip()}\n\n___\n{pr_body}" if get_settings().pr_description.publish_description_as_comment_persistent: self.git_provider.publish_persistent_comment(full_markdown_description, initial_header="## Title", update_header=True, name="describe", final_update_message=False, ) else: self.git_provider.publish_comment(full_markdown_description) else: self.git_provider.publish_description(pr_title.strip(), pr_body) # publish final update message if (get_settings().pr_description.final_update_message and not get_settings().config.get('is_auto_command', False)): latest_commit_url = self.git_provider.get_latest_commit_url() if latest_commit_url: pr_url = self.git_provider.get_pr_url() update_comment = f"**[PR Description]({pr_url})** updated to latest commit ({latest_commit_url})" self.git_provider.publish_comment(update_comment) self.git_provider.remove_initial_comment() else: get_logger().info('PR description, but not published since publish_output is False.') get_settings().data = {"artifact": pr_body} return except Exception as e: get_logger().error(f"Error generating PR description {self.pr_id}: {e}", artifact={"traceback": traceback.format_exc()}) return "" async def _prepare_prediction(self, model: str) -> None: if get_settings().pr_description.use_description_markers and 'pr_agent:' not in self.user_description: get_logger().info("Markers were enabled, but user description does not contain markers. 
Skipping AI prediction") return None large_pr_handling = get_settings().pr_description.enable_large_pr_handling and "pr_description_only_files_prompts" in get_settings() output = get_pr_diff(self.git_provider, self.token_handler, model, large_pr_handling=large_pr_handling, return_remaining_files=True) if isinstance(output, tuple): patches_diff, remaining_files_list = output else: patches_diff = output remaining_files_list = [] if not large_pr_handling or patches_diff: self.patches_diff = patches_diff if patches_diff: # generate the prediction get_logger().debug(f"PR diff", artifact=self.patches_diff) self.prediction = await self._get_prediction(model, patches_diff, prompt="pr_description_prompt") # extend the prediction with additional files not shown if get_settings().pr_description.enable_semantic_files_types: self.prediction = await self.extend_uncovered_files(self.prediction) else: get_logger().error(f"Error getting PR diff {self.pr_id}", artifact={"traceback": traceback.format_exc()}) self.prediction = None else: # get the diff in multiple patches, with the token handler only for the files prompt get_logger().debug('large_pr_handling for describe') token_handler_only_files_prompt = TokenHandler( self.git_provider.pr, self.vars, get_settings().pr_description_only_files_prompts.system, get_settings().pr_description_only_files_prompts.user, ) (patches_compressed_list, total_tokens_list, deleted_files_list, remaining_files_list, file_dict, files_in_patches_list) = get_pr_diff_multiple_patchs( self.git_provider, token_handler_only_files_prompt, model) # get the files prediction for each patch if not get_settings().pr_description.async_ai_calls: results = [] for i, patches in enumerate(patches_compressed_list): # sync calls patches_diff = "\n".join(patches) get_logger().debug(f"PR diff number {i + 1} for describe files") prediction_files = await self._get_prediction(model, patches_diff, prompt="pr_description_only_files_prompts") results.append(prediction_files) 
else: # async calls tasks = [] for i, patches in enumerate(patches_compressed_list): if patches: patches_diff = "\n".join(patches) get_logger().debug(f"PR diff number {i + 1} for describe files") task = asyncio.create_task( self._get_prediction(model, patches_diff, prompt="pr_description_only_files_prompts")) tasks.append(task) # Wait for all tasks to complete results = await asyncio.gather(*tasks) file_description_str_list = [] for i, result in enumerate(results): prediction_files = result.strip().removeprefix('```yaml').strip('`').strip() if load_yaml(prediction_files, keys_fix_yaml=self.keys_fix) and prediction_files.startswith('pr_files'): prediction_files = prediction_files.removeprefix('pr_files:').strip() file_description_str_list.append(prediction_files) else: get_logger().debug(f"failed to generate predictions in iteration {i + 1} for describe files") # generate files_walkthrough string, with proper token handling token_handler_only_description_prompt = TokenHandler( self.git_provider.pr, self.vars, get_settings().pr_description_only_description_prompts.system, get_settings().pr_description_only_description_prompts.user) files_walkthrough = "\n".join(file_description_str_list) files_walkthrough_prompt = copy.deepcopy(files_walkthrough) MAX_EXTRA_FILES_TO_PROMPT = 50 if remaining_files_list: files_walkthrough_prompt += "\n\nNo more token budget. Additional unprocessed files:" for i, file in enumerate(remaining_files_list): files_walkthrough_prompt += f"\n- {file}" if i >= MAX_EXTRA_FILES_TO_PROMPT: get_logger().debug(f"Too many remaining files, clipping to {MAX_EXTRA_FILES_TO_PROMPT}") files_walkthrough_prompt += f"\n... 
async def extend_uncovered_files(self, original_prediction: str) -> str:
    """
    Append placeholder entries for PR files that the AI prediction did not cover.

    The prediction is parsed as YAML. It may be either a mapping with a `pr_files`
    list or a bare list of file entries; the returned YAML keeps the same shape as
    the input. At most MAX_EXTRA_FILES_TO_OUTPUT placeholder entries are added,
    followed by a single "Additional files not shown" entry when more remain.

    Args:
        original_prediction (str): The YAML prediction text.

    Returns:
        str: The extended YAML text, or `original_prediction` unchanged when nothing
        was added, the merged YAML does not load, or any error occurs.
    """
    try:
        prediction = original_prediction

        # get the original prediction filenames
        original_prediction_loaded = load_yaml(original_prediction, keys_fix_yaml=self.keys_fix)
        # Remember the input shape from the *parsed* value: the raw argument is a string,
        # so checking it with isinstance(..., list) can never detect a list-shaped prediction.
        prediction_is_list = isinstance(original_prediction_loaded, list)
        if prediction_is_list:
            original_prediction_dict = {"pr_files": original_prediction_loaded}
        else:
            original_prediction_dict = original_prediction_loaded

        filenames_predicted = set()
        if isinstance(original_prediction_dict, dict):
            # `pr_files` / `filename` may be present but null in a truncated prediction
            for file in original_prediction_dict.get('pr_files') or []:
                if isinstance(file, dict):
                    filenames_predicted.add(str(file.get('filename') or '').strip())

        # extend the prediction with additional files not included in the original prediction
        pr_files = self.git_provider.get_diff_files()
        prediction_extra = "pr_files:"
        MAX_EXTRA_FILES_TO_OUTPUT = 100
        counter_extra_files = 0
        for file in pr_files:
            if file.filename in filenames_predicted:
                continue

            # add up to MAX_EXTRA_FILES_TO_OUTPUT files
            counter_extra_files += 1
            if counter_extra_files > MAX_EXTRA_FILES_TO_OUTPUT:
                extra_file_yaml = f"""\
- filename: |
    Additional files not shown
  changes_title: |
    ...
  label: |
    additional files
"""
                prediction_extra = prediction_extra + "\n" + extra_file_yaml.strip()
                get_logger().debug(f"Too many remaining files, clipping to {MAX_EXTRA_FILES_TO_OUTPUT}")
                break

            extra_file_yaml = f"""\
- filename: |
    {file.filename}
  changes_title: |
    ...
  label: |
    additional files
"""
            prediction_extra = prediction_extra + "\n" + extra_file_yaml.strip()

        # merge the two dictionaries
        if counter_extra_files > 0:
            get_logger().info(f"Adding {counter_extra_files} unprocessed extra files to table prediction")
            prediction_extra_dict = load_yaml(prediction_extra, keys_fix_yaml=self.keys_fix)
            if original_prediction_dict and isinstance(original_prediction_dict, dict) and \
                    isinstance(prediction_extra_dict, dict) and "pr_files" in prediction_extra_dict:
                existing_files = original_prediction_dict.get("pr_files")
                if isinstance(existing_files, list):
                    existing_files.extend(prediction_extra_dict["pr_files"])
                else:
                    original_prediction_dict["pr_files"] = prediction_extra_dict["pr_files"]

                # Give list-shaped input a list back, mapping-shaped input a mapping back
                if prediction_is_list:
                    new_yaml = yaml.dump(original_prediction_dict["pr_files"])
                else:
                    new_yaml = yaml.dump(original_prediction_dict)
                if load_yaml(new_yaml, keys_fix_yaml=self.keys_fix):
                    prediction = new_yaml

        return prediction
    except Exception as e:
        get_logger().exception(f"Error extending uncovered files {self.pr_id}", artifact={"error": e})
        return original_prediction
async def _get_prediction(self, model: str, patches_diff: str, prompt="pr_description_prompt") -> str:
    """
    Render the system/user prompts of the given settings section and ask the AI handler for a completion.

    Args:
        model (str): Model name passed to the AI handler.
        patches_diff (str): Text placed in the `diff` prompt variable.
        prompt (str): Name of the settings section holding the `system` and `user` templates.

    Returns:
        str: The response text returned by the AI handler.
    """
    # Work on a private copy of the shared variables and inject the diff
    prompt_variables = copy.deepcopy(self.vars)
    prompt_variables["diff"] = patches_diff

    jinja_env = Environment(undefined=StrictUndefined)
    set_custom_labels(prompt_variables, self.git_provider)
    self.variables = prompt_variables

    system_template = get_settings().get(prompt, {}).get("system", "")
    system_prompt = jinja_env.from_string(system_template).render(self.variables)
    user_template = get_settings().get(prompt, {}).get("user", "")
    user_prompt = jinja_env.from_string(user_template).render(self.variables)

    completion = await self.ai_handler.chat_completion(
        model=model,
        temperature=get_settings().config.temperature,
        system=system_prompt,
        user=user_prompt
    )
    # chat_completion yields (response, finish_reason); only the response is returned
    response, _finish_reason = completion
    return response
def _prepare_labels(self) -> List[str]:
    """
    Extract the list of PR labels from the loaded prediction data.

    `self.data['labels']` is used when present and non-empty; otherwise
    `self.data['type']` is used if label publishing is enabled in the settings.
    Either value may be a list or a comma-separated string. Labels are stripped,
    empty entries are dropped, and labels found in
    `self.variables['labels_minimal_to_labels_dict']` are replaced by their mapped
    (original-case) form.

    Returns:
        List[str]: The cleaned labels (possibly empty).
    """
    raw_labels = []
    if 'labels' in self.data and self.data['labels']:
        raw_labels = self.data['labels']
    elif 'type' in self.data and self.data['type'] and get_settings().pr_description.publish_labels:
        raw_labels = self.data['type']

    if isinstance(raw_labels, str):
        raw_labels = raw_labels.split(',')
    elif not isinstance(raw_labels, list):
        # Any other YAML shape (mapping, number, ...) carries no usable labels
        raw_labels = []

    pr_labels = []
    for label in raw_labels:
        if label is None:
            continue
        cleaned = str(label).strip()  # tolerate non-string YAML scalars
        if cleaned:  # a stray comma ("a,,b" or "a,") must not yield an empty label
            pr_labels.append(cleaned)

    # convert lowercase labels to original case
    try:
        # `self.variables` is only assigned once a prediction was requested
        variables = getattr(self, 'variables', None) or {}
        if "labels_minimal_to_labels_dict" in variables:
            d: dict = variables["labels_minimal_to_labels_dict"]
            for i, label_i in enumerate(pr_labels):
                if label_i in d:
                    pr_labels[i] = d[label_i]
    except Exception as e:
        get_logger().error(f"Error converting labels to original case {self.pr_id}: {e}")
    return pr_labels
def _prepare_pr_answer(self) -> Tuple[str, str, str, List[dict]]:
    """
    Prepare the PR description based on the AI prediction data.

    Note: this removes the 'title' key (and, depending on provider support and
    settings, the 'labels' and 'type' keys) from `self.data`.

    Returns:
        - title: a string containing the PR title.
        - pr_body: a string containing the PR description body in a markdown format.
        - changes_walkthrough: the collapsible file-walkthrough markdown ("" if not generated).
        - pr_file_changes: the second value returned by `process_pr_files_prediction`
          ([] if it was not called).
    """
    # Don't display 'PR Labels'
    if 'labels' in self.data and self.git_provider.is_supported("get_labels"):
        self.data.pop('labels')
    if not get_settings().pr_description.enable_pr_type:
        # The prediction is not guaranteed to contain a 'type' key; a bare pop() would raise KeyError
        self.data.pop('type', None)

    # Remove the 'PR Title' key from the dictionary
    ai_title = self.data.pop('title', self.vars["title"])
    if not get_settings().pr_description.generate_ai_title:
        # Assign the original PR title to the 'title' variable
        title = self.vars["title"]
    else:
        # Assign the value of the 'PR Title' key to 'title' variable
        title = ai_title

    # Iterate over the remaining dictionary items and append the key and value to 'pr_body' in a markdown format,
    # except for the items containing the word 'walkthrough'
    pr_body, changes_walkthrough = "", ""
    pr_file_changes = []
    for idx, (key, value) in enumerate(self.data.items()):
        if key == 'changes_diagram':
            pr_body += f"### {PRDescriptionHeader.DIAGRAM_WALKTHROUGH.value}\n\n"
            pr_body += f"{value}\n\n"
            continue
        if key == 'pr_files':
            # The files table is rendered from the label-grouped dict, without a section header
            value = self.file_label_dict
        else:
            key_publish = key.rstrip(':').replace("_", " ").capitalize()
            if key_publish == "Type":
                key_publish = "PR Type"
            pr_body += f"### **{key_publish}**\n"

        if 'walkthrough' in key.lower():
            if self.git_provider.is_supported("gfm_markdown"):
                pr_body += "<details> <summary>files:</summary>\n\n"
            for file in value:
                filename = file['filename'].replace("'", "`")
                description = file['changes_in_file']
                pr_body += f'- `{filename}`: {description}\n'
            if self.git_provider.is_supported("gfm_markdown"):
                pr_body += "</details>\n"
        elif 'pr_files' in key.lower() and get_settings().pr_description.enable_semantic_files_types:
            # 'File Walkthrough' section
            changes_walkthrough_table, pr_file_changes = self.process_pr_files_prediction(changes_walkthrough, value)
            if get_settings().pr_description.get('file_table_collapsible_open_by_default', False):
                initial_status = " open"
            else:
                initial_status = ""
            changes_walkthrough = f"<details{initial_status}> <summary><h3> {PRDescriptionHeader.FILE_WALKTHROUGH.value}</h3></summary>\n\n"
            changes_walkthrough += f"{changes_walkthrough_table}\n\n"
            changes_walkthrough += "</details>\n\n"
        elif key.lower().strip() == 'description':
            if isinstance(value, list):
                value = ', '.join(str(v).rstrip() for v in value)
            elif value is None:
                # An empty YAML value must not abort the whole description
                value = ""
            elif not isinstance(value, str):
                value = str(value)
            value = value.replace('\n-', '\n\n-').strip()  # makes the bullet points more readable by adding double space
            pr_body += f"{value}\n"
        else:
            # if the value is a list, join its items by comma
            if isinstance(value, list):
                value = ', '.join(str(v).rstrip() for v in value)
            pr_body += f"{value}\n"
        if idx < len(self.data) - 1:
            pr_body += "\n\n___\n\n"

    return title, pr_body, changes_walkthrough, pr_file_changes
file_label_dict: file_label_dict[label] = [] file_label_dict[label].append((filename, changes_title, changes_summary)) except Exception as e: get_logger().exception(f"Error preparing file label dict {self.pr_id}") pass return file_label_dict def process_pr_files_prediction(self, pr_body, value): pr_comments = [] # logic for using collapsible file list use_collapsible_file_list = get_settings().pr_description.collapsible_file_list num_files = 0 if value: for semantic_label in value.keys(): num_files += len(value[semantic_label]) if use_collapsible_file_list == "adaptive": use_collapsible_file_list = num_files > self.COLLAPSIBLE_FILE_LIST_THRESHOLD if not self.git_provider.is_supported("gfm_markdown"): return pr_body, pr_comments try: pr_body += "<table>" header = f"Relevant files" delta = 75 # header += "  " * delta pr_body += f"""<thead><tr><th></th><th align="left">{header}</th></tr></thead>""" pr_body += """<tbody>""" for semantic_label in value.keys(): s_label = semantic_label.strip("'").strip('"') pr_body += f"""<tr><td><strong>{s_label.capitalize()}</strong></td>""" list_tuples = value[semantic_label] if use_collapsible_file_list: pr_body += f"""<td><details><summary>{len(list_tuples)} files</summary><table>""" else: pr_body += f"""<td><table>""" for filename, file_changes_title, file_change_description in list_tuples: filename = filename.replace("'", "`").rstrip() filename_publish = filename.split("/")[-1] if file_changes_title and file_changes_title.strip() != "...": file_changes_title_code = f"<code>{file_changes_title}</code>" file_changes_title_code_br = insert_br_after_x_chars(file_changes_title_code, x=(delta - 5)).strip() if len(file_changes_title_code_br) < (delta - 5): file_changes_title_code_br += "  " * ((delta - 5) - len(file_changes_title_code_br)) filename_publish = f"<strong>{filename_publish}</strong><dd>{file_changes_title_code_br}</dd>" else: filename_publish = f"<strong>{filename_publish}</strong>" diff_plus_minus = "" delta_nbsp = "" 
diff_files = self.git_provider.get_diff_files() for f in diff_files: if f.filename.lower().strip('/') == filename.lower().strip('/'): num_plus_lines = f.num_plus_lines num_minus_lines = f.num_minus_lines diff_plus_minus += f"+{num_plus_lines}/-{num_minus_lines}" if len(diff_plus_minus) > 12 or diff_plus_minus == "+0/-0": diff_plus_minus = "[link]" delta_nbsp = "  " * max(0, (8 - len(diff_plus_minus))) break # try to add line numbers link to code suggestions link = "" if hasattr(self.git_provider, 'get_line_link'): filename = filename.strip() link = self.git_provider.get_line_link(filename, relevant_line_start=-1) if (not link or not diff_plus_minus) and ('additional files' not in filename.lower()): # get_logger().warning(f"Error getting line link for '{filename}'") link = "" # continue # Add file data to the PR body file_change_description_br = insert_br_after_x_chars(file_change_description, x=(delta - 5)) pr_body = self.add_file_data(delta_nbsp, diff_plus_minus, file_change_description_br, filename, filename_publish, link, pr_body) # Close the collapsible file list if use_collapsible_file_list: pr_body += """</table></details></td></tr>""" else: pr_body += """</table></td></tr>""" pr_body += """</tr></tbody></table>""" except Exception as e: get_logger().error(f"Error processing pr files to markdown {self.pr_id}: {str(e)}") pass return pr_body, pr_comments def add_file_data(self, delta_nbsp, diff_plus_minus, file_change_description_br, filename, filename_publish, link, pr_body) -> str: if not file_change_description_br: pr_body += f""" <tr> <td>{filename_publish}</td> <td><a href="{link}">{diff_plus_minus}</a>{delta_nbsp}</td> </tr> """ else: pr_body += f""" <tr> <td> <details> <summary>{filename_publish}</summary> <hr> {filename} {file_change_description_br} </details> </td> <td><a href="{link}">{diff_plus_minus}</a>{delta_nbsp}</td> </tr> """ return pr_body def count_chars_without_html(string): if '<' not in string: return len(string) no_html_string = 
def insert_br_after_x_chars(text: str, x=70):
    """
    Wrap `text` for HTML display by inserting <br> before a word that would push the
    current line above `x` visible characters.

    Backtick pairs are converted to <code> tags, new lines to <br>, and a text that starts
    with a "- " or "* " bullet is converted to <li> items wrapped in <ul>. Empty input yields
    "", and text shorter than `x` visible characters is returned untouched.
    """
    if not text:
        return ""
    if count_chars_without_html(text) < x:
        return text

    starts_as_list = text.lstrip().startswith(("- ", "* "))

    # odd backticks become <code>, even backticks become </code>
    text = replace_code_tags(text)

    if starts_as_list:
        # Keep any leading indentation, swap the first bullet marker for <li>
        stripped = text.lstrip()
        indent = text[:len(text) - len(stripped)]
        text = indent + "<li>" + stripped[2:]
        for bullet in ("\n- ", "\n - ", "\n* ", "\n * "):
            text = text.replace(bullet, '<br><li> ')

    # Every remaining new line is an explicit break; tokenize on spaces while keeping
    # the break attached to the last word of its line.
    segments = text.replace("\n", '<br>').split('<br>')
    final_segment = len(segments) - 1
    tokens = []
    for position, segment in enumerate(segments):
        pieces = segment.split(' ')
        if position < final_segment:
            pieces[-1] += "<br>"
        tokens.extend(pieces)

    standalone_tags = ("<code>", "</code>", "<li>", "<br>")
    line_resetting_tags = ("<li>", "<br>")
    chunks = []
    in_code = False
    line_length = 0
    for token in tokens:
        visible_length = count_chars_without_html(token)
        if token in standalone_tags:
            # Bare tags take no room on the line and never trigger a wrap
            chunks.append(token + " ")
            if token in line_resetting_tags:
                line_length = 0
        else:
            if line_length + visible_length > x:
                # Close and reopen <code> around the break so the tags stay balanced
                chunks.append("</code><br><code>" if in_code else "<br>")
                line_length = 0
            chunks.append(token + " ")
            line_length += visible_length + 1  # +1 for the space appended after the word

        if "<code>" in token:
            in_code = True
        if "</code>" in token:
            in_code = False

    wrapped = ''.join(chunks).strip()
    return f"<ul>{wrapped}</ul>" if starts_as_list else wrapped
class PRGenerateLabels:
    """
    Tool that asks an AI model for labels describing a pull request and publishes them
    through the git provider.
    """

    def __init__(self, pr_url: str, args: list = None,
                 ai_handler: partial[BaseAiHandler,] = LiteLLMAIHandler):
        """
        Initialize the PRGenerateLabels object with the necessary attributes and objects for generating labels
        corresponding to the PR using an AI model.

        Args:
            pr_url (str): The URL of the pull request.
            args (list, optional): List of arguments passed to the PRGenerateLabels class. Defaults to None.
            ai_handler: Zero-argument callable returning the AI handler to use. Defaults to LiteLLMAIHandler.
        """
        # Initialize the git provider and main PR language
        self.git_provider = get_git_provider()(pr_url)
        self.main_pr_language = get_main_pr_language(
            self.git_provider.get_languages(), self.git_provider.get_files()
        )
        self.pr_id = self.git_provider.get_pr_id()

        # Initialize the AI handler
        self.ai_handler = ai_handler()
        self.ai_handler.main_pr_language = self.main_pr_language

        # Initialize the variables dictionary
        self.vars = {
            "title": self.git_provider.pr.title,
            "branch": self.git_provider.get_pr_branch(),
            "description": self.git_provider.get_pr_description(full=False),
            "language": self.main_pr_language,
            "diff": "",  # empty diff for initial calculation
            "extra_instructions": get_settings().pr_description.extra_instructions,
            "commit_messages_str": self.git_provider.get_commit_messages(),
            "enable_custom_labels": get_settings().config.enable_custom_labels,
            "custom_labels_class": "",  # will be filled if necessary in 'set_custom_labels' function
        }

        # Initialize the token handler
        self.token_handler = TokenHandler(
            self.git_provider.pr,
            self.vars,
            get_settings().pr_custom_labels_prompt.system,
            get_settings().pr_custom_labels_prompt.user,
        )

        # Initialize patches_diff and prediction attributes
        self.patches_diff = None
        self.prediction = None

        # Filled by '_get_prediction' and '_prepare_data'; defined here so that
        # '_prepare_labels' never depends on a missing attribute.
        self.variables = {}
        self.data = {}

    async def run(self):
        """
        Generates a PR labels using an AI model and publishes it to the PR.

        Returns:
            None when the model produced no prediction, otherwise an empty string
            (also after a logged failure).
        """
        try:
            get_logger().info(f"Generating a PR labels {self.pr_id}")
            if get_settings().config.publish_output:
                self.git_provider.publish_comment("Preparing PR labels...", is_temporary=True)

            await retry_with_fallback_models(self._prepare_prediction)

            get_logger().info(f"Preparing answer {self.pr_id}")
            if not self.prediction:
                return None
            self._prepare_data()

            pr_labels = self._prepare_labels()

            if get_settings().config.publish_output:
                get_logger().info(f"Pushing labels {self.pr_id}")

                # Labels already on the PR that 'get_user_labels' keeps are appended to the generated ones
                current_labels = self.git_provider.get_pr_labels()
                user_labels = get_user_labels(current_labels)
                pr_labels = pr_labels + user_labels

                if self.git_provider.is_supported("get_labels"):
                    self.git_provider.publish_labels(pr_labels)
                elif pr_labels:
                    # Provider has no label support: fall back to a plain comment
                    value = ', '.join(pr_labels)
                    pr_labels_text = f"## PR Labels:\n{value}\n"
                    self.git_provider.publish_comment(pr_labels_text, is_temporary=False)
                self.git_provider.remove_initial_comment()
        except Exception as e:
            get_logger().error(f"Error generating PR labels {self.pr_id}: {e}")

        return ""

    async def _prepare_prediction(self, model: str) -> None:
        """
        Prepare the AI prediction for the PR labels based on the provided model.

        Args:
            model (str): The name of the model to be used for generating the prediction.

        Returns:
            None

        Raises:
            Any exceptions raised by the 'get_pr_diff' and '_get_prediction' functions.
        """
        get_logger().info(f"Getting PR diff {self.pr_id}")
        self.patches_diff = get_pr_diff(self.git_provider, self.token_handler, model)
        get_logger().info(f"Getting AI prediction {self.pr_id}")
        self.prediction = await self._get_prediction(model)

    async def _get_prediction(self, model: str) -> str:
        """
        Generate an AI prediction for the PR labels based on the provided model.

        Args:
            model (str): The name of the model to be used for generating the prediction.

        Returns:
            str: The generated AI prediction.
        """
        variables = copy.deepcopy(self.vars)
        variables["diff"] = self.patches_diff  # update diff

        environment = Environment(undefined=StrictUndefined)
        set_custom_labels(variables, self.git_provider)
        # Kept on the instance: '_prepare_labels' reads 'labels_minimal_to_labels_dict' from it
        self.variables = variables

        system_prompt = environment.from_string(get_settings().pr_custom_labels_prompt.system).render(self.variables)
        user_prompt = environment.from_string(get_settings().pr_custom_labels_prompt.user).render(self.variables)

        response, _ = await self.ai_handler.chat_completion(
            model=model,
            temperature=get_settings().config.temperature,
            system=system_prompt,
            user=user_prompt
        )

        return response

    def _prepare_data(self):
        """Load the AI prediction (YAML text) into self.data."""
        self.data = load_yaml(self.prediction.strip())

    def _prepare_labels(self) -> List[str]:
        """
        Extract the list of labels from the parsed prediction.

        The 'labels' entry may be a list or a comma-separated string. Labels are stripped,
        empty ones are dropped, and labels found in 'labels_minimal_to_labels_dict' are mapped
        back to their original form.

        Returns:
            List[str]: the labels; empty when the prediction holds no usable 'labels' entry.
        """
        # A failed or truncated YAML parse may leave something other than a dict here
        data = self.data if isinstance(self.data, dict) else {}
        raw_labels = data.get('labels')
        if isinstance(raw_labels, list):
            candidates = raw_labels
        elif isinstance(raw_labels, str):
            candidates = raw_labels.split(',')
        else:
            candidates = []

        pr_types = [str(label).strip() for label in candidates if label is not None]
        pr_types = [label for label in pr_types if label]

        # convert lowercase labels to original case
        try:
            if "labels_minimal_to_labels_dict" in self.variables:
                d: dict = self.variables["labels_minimal_to_labels_dict"]
                pr_types = [d[label] if label in d else label for label in pr_types]
        except Exception as e:
            get_logger().error(f"Error converting labels to original case {self.pr_id}: {e}")
        return pr_types
def modify_answer_section(ai_response: str) -> str | None:
    """
    Replace everything up to and including the "### Answer:" heading of a formatted
    Q&A response with the heading ":bulb: Auto-generated documentation-based answer:".

    The answer text and its "#### Relevant Sources" part are taken from
    extract_model_answer_and_relevant_sources() and kept as is.

    For example, the input:

        ### Question:
        The following general issue was asked by a user: Title: How does one request to re-review a PR? More Info: I cannot seem to find to do this.
        ### Answer:
        According to the documentation, one needs to invoke the command: /review
        #### Relevant Sources...

    becomes:

        ### :bulb: Auto-generated documentation-based answer:
        According to the documentation, one needs to invoke the command: /review
        #### Relevant Sources...

    Returns:
        The rewritten text, or None (after logging a warning) when no answer could be extracted.
    """
    answer_and_sources = extract_model_answer_and_relevant_sources(ai_response)
    if answer_and_sources is None:
        get_logger().warning(f"Either no answer section found, or that section is malformed: {ai_response}")
        return None
    return "### :bulb: Auto-generated documentation-based answer:\n" + answer_and_sources
def return_document_headings(text: str, ext: str) -> str:
    """
    Extract the headings of a documentation file.

    Args:
        text: Full contents of the file.
        ext: File extension including the dot. '.md' and '.mdx' are parsed as Markdown,
             '.rst' as reStructuredText.

    Returns:
        The distinct headings joined by new lines, in the order they first appear in the
        document. An empty string when the text is empty or has no letters, when the
        extension is unsupported, or when an unexpected error occurs.
    """
    try:
        if not text or not re.search(r'[a-zA-Z]', text):
            get_logger().error(f"Empty or non text content found in text: {text}.")
            return ""

        lines = text.split('\n')
        if ext in ['.md', '.mdx']:
            # Extract Markdown headings (lines starting with #)
            found = [line.strip() for line in lines if line.strip().startswith('#')]
        elif ext == '.rst':
            # Allowed characters according to list from:
            # https://docutils.sourceforge.io/docs/ref/rst/restructuredtext.html#sections
            section_chars = set('!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~')
            found = []
            for idx, line in enumerate(lines):
                marker = line.rstrip()
                # A marker line repeats one single section character
                if not marker or marker[0] not in section_chars or any(c != marker[0] for c in marker):
                    continue
                if idx == 0:
                    continue  # nothing above it, so it cannot underline a heading
                # The heading is the non-empty line above, no longer than its underline
                title = lines[idx - 1].rstrip()
                if title and len(title) <= len(marker):
                    found.append(title)
        else:
            get_logger().error(f"Unsupported file extension: {ext}")
            return ""

        # dict.fromkeys drops duplicates while keeping first-seen order, so the result follows
        # the document structure and is identical from run to run (a set guarantees neither).
        return '\n'.join(dict.fromkeys(found))
    except Exception:
        get_logger().exception(f"Unexpected exception thrown. Returning empty result.")
        return ""
def format_markdown_q_and_a_response(question_str: str, response_str: str, relevant_sections: list[dict[str, str]],
                                     supported_suffixes: list[str], base_url_prefix: str, base_url_suffix: str="") -> str:
    """
    Render a question, the model answer and its sources as markdown.

    Args:
        question_str: The user question.
        response_str: The model answer; surrounding whitespace is stripped.
        relevant_sections: Dicts with a 'file_name' and, optionally, a
            'relevant_section_header_string' used to build an anchor link.
        supported_suffixes: A section is listed only if its file name ends with one of these.
        base_url_prefix: URL prefix of the listed files; surrounding '/' are stripped.
        base_url_suffix: Text appended right after the file path in every link.

    Returns:
        The markdown text, or an empty string if an unexpected error occurs. A section that
        is malformed (not a dict, no file name) is skipped instead of discarding the whole answer.
    """
    try:
        base_url_prefix = base_url_prefix.strip('/')  # Sanitize base_url_prefix
        parts = [
            f"### Question: \n{question_str}\n\n",
            f"### Answer:\n{response_str.strip()}\n\n",
            "#### Relevant Sources:\n\n",
        ]
        for section in relevant_sections:
            file_name = section.get('file_name') if isinstance(section, dict) else None
            if not file_name:
                get_logger().warning(f"Relevant section has no file name, skipping it: {section}")
                continue
            # Remove any '/' in the beginning, since some models do it anyway
            file = str(file_name).lstrip('/').strip()
            if not any(file.endswith(suffix) for suffix in supported_suffixes):
                get_logger().warning(f"Unsupported file extension: {file}")
                continue

            # A missing or None header is treated like an empty one: link to the file without an anchor
            header = section.get('relevant_section_header_string')
            if header is not None and str(header).strip():
                markdown_header = format_markdown_header(header)
                if base_url_prefix:
                    parts.append(f"> - {base_url_prefix}/{file}{base_url_suffix}#{markdown_header}\n")
            else:
                parts.append(f"> - {base_url_prefix}/{file}{base_url_suffix}\n")
        return ''.join(parts)
    except Exception:
        get_logger().exception(f"Unexpected exception thrown. Returning empty result.")
        return ""
class PredictionPreparator:
    """
    Callable that renders the system and user prompts once, at construction, and sends
    them to the AI handler each time it is called with a model name.
    """

    def __init__(self, ai_handler, vars, system_prompt, user_prompt):
        """
        Render both prompt templates with a deep copy of `vars`.

        If rendering fails, the error is logged and ai_handler is set to None, which makes
        __call__ raise instead of querying the model.
        """
        self.ai_handler = ai_handler
        try:
            template_values = copy.deepcopy(vars)
            jinja_env = Environment(undefined=StrictUndefined)
            self.system_prompt = jinja_env.from_string(system_prompt).render(template_values)
            self.user_prompt = jinja_env.from_string(user_prompt).render(template_values)
        except Exception:
            get_logger().exception(f"Caught exception during init. Setting ai_handler to None to prevent __call__.")
            self.ai_handler = None

    #Called by retry_with_fallback_models and therefore, on any failure must throw an exception:
    async def __call__(self, model: str) -> str:
        """
        Query `model` with the rendered prompts and return its response.

        Raises:
            ValueError: if construction failed and no AI handler is available.
            Exception: whatever the chat completion raised, after logging it.
        """
        if not self.ai_handler:
            get_logger().error("ai handler not set. Cannot invoke model!")
            raise ValueError("PredictionPreparator not initialized")

        try:
            completion = await self.ai_handler.chat_completion(
                model=model,
                temperature=get_settings().config.temperature,
                system=self.system_prompt,
                user=self.user_prompt,
            )
            answer, _finish_reason = completion
            return answer
        except Exception as exc:
            get_logger().exception("Caught exception during prediction.",
                                   artifacts={'system': self.system_prompt, 'user': self.user_prompt})
            raise exc
async def run(self):
    """
    Answer self.question from the repository's documentation files.

    Flow: gather the documentation files, build one prompt out of them, fall back to a
    ranked selection when `_trim_docs_input` reports the prompt as too long, query the model,
    parse its YAML response and format it as markdown.

    Returns:
        The formatted answer string on success (it is also published as a comment when
        `return_as_string` is False, the answer is non empty and `config.publish_output` is set).
        None when there is no question, no usable documentation, the model response cannot be
        parsed or lacks its response/relevant sections, the question is flagged as not relevant,
        or an exception was raised (it is logged, not propagated).
    """
    if not self.question:
        get_logger().warning('No question provided. Will do nothing.')
        return None

    try:
        # Clone the repository and gather relevant documentation files.
        docs_filepath_to_contents = self._gen_filenames_to_contents_map_from_repo()

        #Generate prompt for the AI model. This will be the full text of all the documentation files combined.
        docs_prompt = aggregate_documentation_files_for_prompt_contents(docs_filepath_to_contents)
        if not docs_filepath_to_contents or not docs_prompt:
            get_logger().warning(f"Could not find any usable documentation. Returning with no result...")
            return None
        docs_prompt_to_send_to_model = docs_prompt

        # Estimate how many tokens will be needed.
        # In case the expected number of tokens exceeds LLM limits, retry with just headings, asking the LLM to rank according to relevance to the question.
        # Based on returned ranking, rerun but sort the documents accordingly, this time, trim in case of exceeding limit.

        #First, check if the text is not too long to even query the LLM provider:
        max_allowed_txt_input = get_maximal_text_input_length_for_token_count_estimation()
        # With only_return_if_trim_needed=True the call returns a flag instead of trimmed text
        invoke_llm_just_with_headings = self._trim_docs_input(docs_prompt_to_send_to_model, max_allowed_txt_input,
                                                              only_return_if_trim_needed=True)
        if invoke_llm_just_with_headings:
            #Entire docs is too long. Rank and return according to relevance.
            docs_prompt_to_send_to_model = await self._rank_docs_and_return_them_as_prompt(docs_filepath_to_contents,
                                                                                          max_allowed_txt_input)

        if not docs_prompt_to_send_to_model:
            get_logger().error("Failed to generate docs prompt for model. Returning with no result...")
            return
        # At this point, either all original documents be used (if their total length doesn't exceed limits), or only those selected.
        self.vars['snippets'] = docs_prompt_to_send_to_model.strip()
        # Run the AI model and extract sections from its response
        response = await retry_with_fallback_models(PredictionPreparator(self.ai_handler, self.vars,
                                                                         get_settings().pr_help_docs_prompts.system,
                                                                         get_settings().pr_help_docs_prompts.user),
                                                    model_type=ModelType.REGULAR)
        response_yaml = load_yaml(response)
        if not response_yaml:
            get_logger().error("Failed to parse the AI response.", artifacts={'response': response})
            return
        response_str = response_yaml.get('response')
        relevant_sections = response_yaml.get('relevant_sections')
        if not response_str or not relevant_sections:
            get_logger().error("Failed to extract response/relevant sections.",
                               artifacts={'raw_response': response, 'response_str': response_str, 'relevant_sections': relevant_sections})
            return
        # A missing 'question_is_relevant' key counts as relevant (default '1')
        if int(response_yaml.get('question_is_relevant', '1')) == 0:
            get_logger().warning(f"Question is not relevant. Returning without an answer...",
                                 artifacts={'raw_response': response})
            return

        # Format the response as markdown
        answer_str = self._format_model_answer(response_str, relevant_sections)
        if self.return_as_string: #Skip publishing
            return answer_str
        #Otherwise, publish the answer if answer is non empty and publish is not turned off:
        if answer_str and get_settings().config.publish_output:
            self.git_provider.publish_comment(answer_str)
        else:
            get_logger().info("Answer:", artifacts={'answer_str': answer_str})
        return answer_str
    except Exception as e:
        # Best effort: log and fall through, which returns None to the caller
        get_logger().exception('failed to provide answer to given user question as a result of a thrown exception (see above)')
Returning empty list.") return [] def _gen_filenames_to_contents_map_from_repo(self) -> dict[str, str]: try: with TemporaryDirectory() as tmp_dir: get_logger().debug(f"About to clone repository: {self.repo_url} to temporary directory: {tmp_dir}...") returned_cloned_repo_root = self.git_provider.clone(self.repo_url, tmp_dir, remove_dest_folder=False) if not returned_cloned_repo_root: raise Exception(f"Failed to clone {self.repo_url} to {tmp_dir}") get_logger().debug(f"About to gather relevant documentation files...") doc_files = [] if self.include_root_readme_file: for root, _, files in os.walk(returned_cloned_repo_root.path): # Only look at files in the root directory, not subdirectories if root == returned_cloned_repo_root.path: for file in files: if file.lower().startswith("readme."): doc_files.append(os.path.join(root, file)) abs_docs_path = os.path.join(returned_cloned_repo_root.path, self.docs_path) if os.path.exists(abs_docs_path): doc_files.extend(self._find_all_document_files_matching_exts(abs_docs_path, ignore_readme=(self.docs_path=='.'))) if not doc_files: get_logger().warning(f"No documentation files found matching file extensions: " f"{self.supported_doc_exts} under repo: {self.repo_url} " f"path: {self.docs_path}. Returning empty dict.") return {} get_logger().info(f'For context {self.ctx_url} and repo: {self.repo_url}' f' will be using the following documentation files: ', artifacts={'doc_files': doc_files}) return map_documentation_files_to_contents(returned_cloned_repo_root.path, doc_files) except Exception as e: get_logger().exception(f"Unexpected exception thrown. Returning empty dict.") return {} def _trim_docs_input(self, docs_input: str, max_allowed_txt_input: int, only_return_if_trim_needed=False) -> bool|str: try: if len(docs_input) >= max_allowed_txt_input: get_logger().warning( f"Text length: {len(docs_input)} exceeds the current returned limit of {max_allowed_txt_input} just for token count estimation. 
async def _rank_docs_and_return_them_as_prompt(self, docs_filepath_to_contents: dict[str, str], max_allowed_txt_input: int) -> str:
    """
    Let the model rank the documentation files by relevance and build a prompt from its selection.

    The model is first queried with a headings-only rendering of the documents (trimmed by
    ``_trim_docs_input`` if needed). Its ``relevant_files_ranking`` answer is then mapped back
    to the original files by index, and the selected files are aggregated and trimmed again.

    Side effect: ``self.vars['snippets']`` is overwritten with the headings-only prompt.

    Args:
        docs_filepath_to_contents: Mapping of documentation file path to its contents.
        max_allowed_txt_input: Maximal text length, forwarded to ``_trim_docs_input``.

    Returns:
        The prompt text built from the selected documents, or "" when trimming yields an
        empty result, the model response cannot be parsed, or an exception is raised.
    """
    try:
        # Return just file name and their headings (if exist):
        headings_prompt = aggregate_documentation_files_for_prompt_contents(docs_filepath_to_contents,
                                                                            return_just_headings=True)
        # Verify list of headings does not exceed limits - trim it if it does.
        headings_prompt = self._trim_docs_input(headings_prompt, max_allowed_txt_input,
                                                only_return_if_trim_needed=False)
        if not headings_prompt:
            get_logger().error("_trim_docs_input returned an empty result.")
            return ""

        self.vars['snippets'] = headings_prompt.strip()
        # Run the AI model and extract sections from its response
        response = await retry_with_fallback_models(PredictionPreparator(self.ai_handler, self.vars,
                                                                         get_settings().pr_help_docs_headings_prompts.system,
                                                                         get_settings().pr_help_docs_headings_prompts.user),
                                                    model_type=ModelType.REGULAR)
        response_yaml = load_yaml(response)
        if not response_yaml:
            get_logger().error("Failed to parse the AI response.", artifacts={'response': response})
            return ""

        # Sanitize the output so that the file names match 1:1 dictionary keys. Do this via the
        # file index and not its name, which may be altered by the model.
        ranking = response_yaml.get('relevant_files_ranking') or []  # a missing key means "nothing ranked"
        doc_paths = list(docs_filepath_to_contents)  # materialized once; index -> file path
        selected_docs_dict = {}
        for entry in ranking:
            try:
                idx = int(entry['idx'])
            except (KeyError, TypeError, ValueError):
                # A single malformed entry must not discard the rest of the ranking.
                continue
            if 0 <= idx < len(doc_paths):
                # Insertion order follows the model's ranking; repeated indices keep their first position.
                selected_docs_dict[doc_paths[idx]] = docs_filepath_to_contents[doc_paths[idx]]
        if not selected_docs_dict:
            get_logger().warning("Model ranking did not reference any valid documentation file.",
                                 artifacts={'response': response})

        docs_prompt_to_send_to_model = aggregate_documentation_files_for_prompt_contents(selected_docs_dict)

        # Check if the updated list of documents does not exceed limits and trim if it does:
        docs_prompt_to_send_to_model = self._trim_docs_input(docs_prompt_to_send_to_model, max_allowed_txt_input,
                                                             only_return_if_trim_needed=False)
        if not docs_prompt_to_send_to_model:
            get_logger().error("_trim_docs_input returned an empty result.")
            return ""
        return docs_prompt_to_send_to_model
    except Exception:
        get_logger().exception("Unexpected exception thrown. Returning empty result.")
        return ""
def extract_header(snippet):
    """
    Build a ``#anchor`` fragment from the topmost ``Header N: <title>`` line of a snippet.

    Only the text before the ``===Snippet content===`` marker is inspected. The title is
    lower-cased and its spaces are replaced with dashes.

    Args:
        snippet: Snippet text, optionally containing ``Header ...: title`` lines followed by
            the ``===Snippet content===`` marker.

    Returns:
        ``"#<title-as-anchor>"`` for the first well-formed header line, or ``""`` when no
        header line with a title exists.
    """
    preamble = snippet.split('===Snippet content===')[0]
    for line in preamble.split('\n'):
        line = line.strip()
        if not line.startswith('Header '):
            continue
        parts = line.split(': ')
        if len(parts) < 2:
            # Header line without a ': <title>' part - skip it instead of raising IndexError.
            continue
        title = parts[1]
        # The topmost header line decides the result, even when its title is empty.
        return f"#{title.lower().replace(' ', '-')}" if title else ''
    return ''
def format_markdown_header(self, header: str) -> str:
    """
    Turn a markdown header string into a lower-case, dash-separated anchor.

    Leading/trailing '#', spaces, gem emojis and newlines are stripped first. Then every
    space becomes '-', and the characters ' ` ( ) , . ? ! are dropped.

    Args:
        header: The header text as returned by the model.

    Returns:
        The anchor text, or "" if formatting raised an exception (which is logged).
    """
    try:
        # First, strip common characters from both ends
        trimmed = header.strip('# 💎\n')
        # One translation table: map space -> dash, delete the punctuation characters.
        anchor_table = str.maketrans(' ', '-', "'`(),.?!")
        return trimmed.translate(anchor_table).lower()
    except Exception:
        get_logger().exception(f"Error while formatting markdown header", artifacts={'header': header})
        return ""
not any(folder in str(file) for folder in folders_to_exclude) and not any(file.name == file_to_exclude for file_to_exclude in files_to_exclude)] # sort the 'md_files' so that 'priority_files' will be at the top priority_files_strings = ['/docs/index.md', '/usage-guide', 'tools/describe.md', 'tools/review.md', 'tools/improve.md', '/faq'] md_files_priority = [file for file in md_files if any(priority_string in str(file) for priority_string in priority_files_strings)] md_files_not_priority = [file for file in md_files if file not in md_files_priority] md_files = md_files_priority + md_files_not_priority docs_prompt = "" for file in md_files: try: with open(file, 'r') as f: file_path = str(file).replace(str(docs_path), '') docs_prompt += f"\n==file name==\n\n{file_path}\n\n==file content==\n\n{f.read().strip()}\n=========\n\n" except Exception as e: get_logger().error(f"Error while reading the file {file}: {e}") token_count = self.token_handler.count_tokens(docs_prompt) get_logger().debug(f"Token count of full documentation website: {token_count}") model = get_settings().config.model if model in MAX_TOKENS: max_tokens_full = MAX_TOKENS[model] # note - here we take the actual max tokens, without any reductions. we do aim to get the full documentation website in the prompt else: max_tokens_full = get_max_tokens(model) delta_output = 2000 if token_count > max_tokens_full - delta_output: get_logger().info(f"Token count {token_count} exceeds the limit {max_tokens_full - delta_output}. 
Skipping the PR Help message.") docs_prompt = clip_tokens(docs_prompt, max_tokens_full - delta_output) self.vars['snippets'] = docs_prompt.strip() # run the AI model response = await retry_with_fallback_models(self._prepare_prediction, model_type=ModelType.REGULAR) response_yaml = load_yaml(response) if isinstance(response_yaml, str): get_logger().warning(f"failing to parse response: {response_yaml}, publishing the response as is") if get_settings().config.publish_output: answer_str = f"### Question: \n{self.question_str}\n\n" answer_str += f"### Answer:\n\n" answer_str += response_yaml self.git_provider.publish_comment(answer_str) return "" response_str = response_yaml.get('response') relevant_sections = response_yaml.get('relevant_sections') if not relevant_sections: get_logger().info(f"Could not find relevant answer for the question: {self.question_str}") if get_settings().config.publish_output: answer_str = f"### Question: \n{self.question_str}\n\n" answer_str += f"### Answer:\n\n" answer_str += f"Could not find relevant information to answer the question. Please provide more details and try again." 
self.git_provider.publish_comment(answer_str) return "" # prepare the answer answer_str = "" if response_str: answer_str += f"### Question: \n{self.question_str}\n\n" answer_str += f"### Answer:\n{response_str.strip()}\n\n" answer_str += f"#### Relevant Sources:\n\n" base_path = "https://qodo-merge-docs.qodo.ai/" for section in relevant_sections: file = section.get('file_name').strip().removesuffix('.md') if str(section['relevant_section_header_string']).strip(): markdown_header = self.format_markdown_header(section['relevant_section_header_string']) answer_str += f"> - {base_path}{file}#{markdown_header}\n" else: answer_str += f"> - {base_path}{file}\n" # publish the answer if get_settings().config.publish_output: self.git_provider.publish_comment(answer_str) else: get_logger().info(f"Answer:\n{answer_str}") else: if not isinstance(self.git_provider, BitbucketServerProvider) and not self.git_provider.is_supported("gfm_markdown"): self.git_provider.publish_comment( "The `Help` tool requires gfm markdown, which is not supported by your code platform.") return get_logger().info('Getting PR Help Message...') relevant_configs = {'pr_help': dict(get_settings().pr_help), 'config': dict(get_settings().config)} get_logger().debug("Relevant configs", artifacts=relevant_configs) pr_comment = "## PR Agent Walkthrough 🤖\n\n" pr_comment += "Welcome to the PR Agent, an AI-powered tool for automated pull request analysis, feedback, suggestions and more.""" pr_comment += "\n\nHere is a list of tools you can use to interact with the PR Agent:\n" base_path = "https://pr-agent-docs.codium.ai/tools" tool_names = [] tool_names.append(f"[DESCRIBE]({base_path}/describe/)") tool_names.append(f"[REVIEW]({base_path}/review/)") tool_names.append(f"[IMPROVE]({base_path}/improve/)") tool_names.append(f"[UPDATE CHANGELOG]({base_path}/update_changelog/)") tool_names.append(f"[HELP DOCS]({base_path}/help_docs/)") tool_names.append(f"[ADD DOCS]({base_path}/add_docs/)") 
tool_names.append(f"[ASK]({base_path}/ask/)") tool_names.append(f"[GENERATE CUSTOM LABELS]({base_path}/generate_labels/)") descriptions = [] descriptions.append("Generates PR description - title, type, summary, code walkthrough and labels") descriptions.append("Adjustable feedback about the PR, possible issues, security concerns, review effort and more") descriptions.append("Code suggestions for improving the PR") descriptions.append("Automatically updates the changelog") descriptions.append("Answers a question regarding this repository, or a given one, based on given documentation path") descriptions.append("Generates documentation to methods/functions/classes that changed in the PR") descriptions.append("Answering free-text questions about the PR") descriptions.append("Generates custom labels for the PR, based on specific guidelines defined by the user") commands =[] commands.append("`/describe`") commands.append("`/review`") commands.append("`/improve`") commands.append("`/update_changelog`") commands.append("`/help_docs`") commands.append("`/add_docs`") commands.append("`/ask`") commands.append("`/generate_labels`") checkbox_list = [] checkbox_list.append(" - [ ] Run <!-- /describe -->") checkbox_list.append(" - [ ] Run <!-- /review -->") checkbox_list.append(" - [ ] Run <!-- /improve -->") checkbox_list.append(" - [ ] Run <!-- /update_changelog -->") checkbox_list.append(" - [ ] Run <!-- /help_docs -->") checkbox_list.append(" - [ ] Run <!-- /add_docs -->") checkbox_list.append("[*]") checkbox_list.append("[*]") checkbox_list.append("[*]") checkbox_list.append("[*]") checkbox_list.append("[*]") if isinstance(self.git_provider, GithubProvider) and not get_settings().config.get('disable_checkboxes', False): pr_comment += f"<table><tr align='left'><th align='left'>Tool</th><th align='left'>Description</th><th align='left'>Trigger Interactively :gem:</th></tr>" for i in range(len(tool_names)): pr_comment += f"\n<tr><td 
align='left'>\n\n<strong>{tool_names[i]}</strong></td>\n<td>{descriptions[i]}</td>\n<td>\n\n{checkbox_list[i]}\n</td></tr>" pr_comment += "</table>\n\n" pr_comment += f"""\n\n(1) Note that each tool can be [triggered automatically](https://pr-agent-docs.codium.ai/usage-guide/automations_and_usage/#github-app-automatic-tools-when-a-new-pr-is-opened) when a new PR is opened, or called manually by [commenting on a PR](https://pr-agent-docs.codium.ai/usage-guide/automations_and_usage/#online-usage).""" pr_comment += f"""\n\n(2) Tools marked with [*] require additional parameters to be passed. For example, to invoke the `/ask` tool, you need to comment on a PR: `/ask "<question content>"`. See the relevant documentation for each tool for more details.""" elif isinstance(self.git_provider, BitbucketServerProvider): # only support basic commands in BBDC pr_comment = generate_bbdc_table(tool_names[:4], descriptions[:4]) else: pr_comment += f"<table><tr align='left'><th align='left'>Tool</th><th align='left'>Command</th><th align='left'>Description</th></tr>" for i in range(len(tool_names)): pr_comment += f"\n<tr><td align='left'>\n\n<strong>{tool_names[i]}</strong></td><td>{commands[i]}</td><td>{descriptions[i]}</td></tr>" pr_comment += "</table>\n\n" pr_comment += f"""\n\nNote that each tool can be [invoked automatically](https://pr-agent-docs.codium.ai/usage-guide/automations_and_usage/) when a new PR is opened, or called manually by [commenting on a PR](https://pr-agent-docs.codium.ai/usage-guide/automations_and_usage/#online-usage).""" if get_settings().config.publish_output: self.git_provider.publish_comment(pr_comment) except Exception as e: get_logger().exception(f"Error while running PRHelpMessage: {e}") return "" async def prepare_relevant_snippets(self, sim_results): # Get relevant snippets relevant_snippets_full = [] relevant_pages_full = [] relevant_snippets_full_header = [] th = 0.75 for s in sim_results: page = s[0].metadata['source'] content = 
def generate_bbdc_table(column_arr_1, column_arr_2):
    """
    Render two parallel columns as a two-column markdown table (Tool / Description).

    The number of data rows equals the length of the longer column; the shorter column
    is padded with empty cells.

    Args:
        column_arr_1: Values of the "Tool" column.
        column_arr_2: Values of the "Description" column.

    Returns:
        The markdown table as a single string: header row, separator row, data rows.
    """
    def cell(column, row):
        # Cells past the end of a shorter column are rendered empty.
        return column[row] if row < len(column) else ""

    row_count = max(len(column_arr_1), len(column_arr_2))
    # Header row and separator row come first, followed by one line per data row.
    table_lines = ["| Tool | Description | \n", "|--|--|\n"]
    table_lines.extend(
        f"| {cell(column_arr_1, row)} | {cell(column_arr_2, row)} |\n"
        for row in range(row_count)
    )
    return "".join(table_lines)
class PR_LineQuestions:
    """
    Answers a free-text question about selected lines of a PR diff hunk.

    The hunk, line range, side, file name and comment id are read from the settings
    object at run time; the answer is published as a reply or as a regular comment.
    """

    def __init__(self, pr_url: str, args=None, ai_handler: partial[BaseAiHandler,] = LiteLLMAIHandler):
        """
        Args:
            pr_url: URL handed to the git provider.
            args: Question tokens; joined with spaces into the question string.
            ai_handler: Zero-argument callable that creates the AI handler.
        """
        self.question_str = self.parse_args(args)
        self.git_provider = get_git_provider()(pr_url)
        self.main_pr_language = get_main_pr_language(
            self.git_provider.get_languages(), self.git_provider.get_files()
        )

        handler = ai_handler()
        handler.main_pr_language = self.main_pr_language
        self.ai_handler = handler

        # Template variables; hunk, selected lines and history are filled in later.
        self.vars = {
            "title": self.git_provider.pr.title,
            "branch": self.git_provider.get_pr_branch(),
            "diff": "",  # empty diff for initial calculation
            "question": self.question_str,
            "full_hunk": "",
            "selected_lines": "",
            "conversation_history": "",
        }
        self.token_handler = TokenHandler(
            self.git_provider.pr,
            self.vars,
            get_settings().pr_line_questions_prompt.system,
            get_settings().pr_line_questions_prompt.user,
        )
        self.patches_diff = None
        self.prediction = None

    def parse_args(self, args):
        """Join the given argument tokens with spaces; return "" when there are none."""
        if not args or not len(args) > 0:
            return ""
        return " ".join(args)

    async def run(self):
        """
        Locate the requested hunk, query the model and publish its answer.

        The answer is only produced when a hunk with lines was found. It is posted as
        a reply when a comment id is configured, otherwise as a new comment.
        Always returns "".
        """
        get_logger().info('Answering a PR lines question...')

        # Conversation history is only loaded when enabled, and only for the GitHub provider.
        if get_settings().pr_questions.use_conversation_history and isinstance(self.git_provider, GithubProvider):
            self.vars["conversation_history"] = self._load_conversation_history()

        self.patch_with_lines = ""
        ask_diff = get_settings().get('ask_diff_hunk', "")
        line_start = get_settings().get('line_start', '')
        line_end = get_settings().get('line_end', '')
        side = get_settings().get('side', 'RIGHT')
        file_name = get_settings().get('file_name', '')
        comment_id = get_settings().get('comment_id', '')

        if ask_diff:
            # The hunk was supplied directly through the settings.
            self.patch_with_lines, self.selected_lines = extract_hunk_lines_from_patch(
                ask_diff, file_name, line_start=line_start, line_end=line_end, side=side)
        else:
            # Otherwise take the patch of the diff file whose name matches.
            for diff_file in self.git_provider.get_diff_files():
                if diff_file.filename != file_name:
                    continue
                self.patch_with_lines, self.selected_lines = extract_hunk_lines_from_patch(
                    diff_file.patch, diff_file.filename, line_start=line_start, line_end=line_end, side=side)

        if self.patch_with_lines:
            raw_answer = await retry_with_fallback_models(self._get_prediction, model_type=ModelType.WEAK)

            # sanitize the answer so that no line will start with "/"
            safe_answer = raw_answer.strip().replace("\n/", "\n /")
            if safe_answer.startswith("/"):
                safe_answer = " " + safe_answer

            get_logger().info('Preparing answer...')
            if comment_id:
                self.git_provider.reply_to_comment_from_comment_id(comment_id, safe_answer)
            else:
                self.git_provider.publish_comment(safe_answer)

        return ""

    def _load_conversation_history(self) -> str:
        """
        Build a numbered transcript of the review thread the question belongs to.

        Empty comments and the comment whose id equals the configured comment id are
        left out.

        Returns:
            Lines of the form "<n>. <author>: <body>" joined by newlines, or "" when a
            required setting is missing, nothing remains after filtering, or an error
            occurs.
        """
        comment_id = get_settings().get('comment_id', '')
        file_path = get_settings().get('file_name', '')
        line_number = get_settings().get('line_end', '')

        # early return if any required parameter is missing
        if not all([comment_id, file_path, line_number]):
            get_logger().error("Missing required parameters for conversation history")
            return ""

        try:
            history = []
            for comment in self.git_provider.get_review_thread_comments(comment_id):
                body = getattr(comment, 'body', '')
                # skip empty comments and the current comment (it is sent as the question itself)
                if not body or not body.strip() or comment_id == comment.id:
                    continue
                user = comment.user
                author = user.login if hasattr(user, 'login') else 'Unknown'
                history.append((author, body))

            if not history:
                return ""

            comment_count = len(history)
            get_logger().info(f"Loaded {comment_count} comments from the code review thread")
            # Numbered list, one entry per retained comment.
            return "\n".join(f"{position}. {author}: {body}"
                             for position, (author, body) in enumerate(history, start=1))
        except Exception as e:
            get_logger().error(f"Error processing conversation history, error: {e}")
            return ""

    async def _get_prediction(self, model: str):
        """
        Render the line-question prompts and return the model's response text.

        Args:
            model: Model name passed to the AI handler.
        """
        variables = copy.deepcopy(self.vars)
        variables["full_hunk"] = self.patch_with_lines  # update diff
        variables["selected_lines"] = self.selected_lines

        environment = Environment(undefined=StrictUndefined)
        prompts = get_settings().pr_line_questions_prompt
        system_prompt = environment.from_string(prompts.system).render(variables)
        user_prompt = environment.from_string(get_settings().pr_line_questions_prompt.user).render(variables)

        if get_settings().config.verbosity_level >= 2:
            # Prompts are printed (not logged) at high verbosity.
            print(f"\nSystem prompt:\n{system_prompt}")
            print(f"\nUser prompt:\n{user_prompt}")

        response, _finish_reason = await self.ai_handler.chat_completion(
            model=model,
            temperature=get_settings().config.temperature,
            system=system_prompt,
            user=user_prompt,
        )
        return response
class PRQuestions:
    """
    Answers a free-text question about a pull request, based on its diff.

    The question may reference an image; in that case the image path is forwarded to
    the AI handler together with the prompts.
    """

    def __init__(self, pr_url: str, args=None, ai_handler: partial[BaseAiHandler,] = LiteLLMAIHandler):
        """
        Args:
            pr_url: URL handed to the git provider.
            args: Question tokens; joined with spaces into the question string.
            ai_handler: Zero-argument callable that creates the AI handler.
        """
        parsed_question = self.parse_args(args)
        self.pr_url = pr_url
        self.git_provider = get_git_provider()(pr_url)
        self.main_pr_language = get_main_pr_language(
            self.git_provider.get_languages(), self.git_provider.get_files()
        )

        handler = ai_handler()
        handler.main_pr_language = self.main_pr_language
        self.ai_handler = handler

        self.question_str = parsed_question
        # Template variables; the diff is filled in when the prediction is prepared.
        self.vars = {
            "title": self.git_provider.pr.title,
            "branch": self.git_provider.get_pr_branch(),
            "description": self.git_provider.get_pr_description(),
            "language": self.main_pr_language,
            "diff": "",  # empty diff for initial calculation
            "questions": self.question_str,
            "commit_messages_str": self.git_provider.get_commit_messages(),
        }
        self.token_handler = TokenHandler(
            self.git_provider.pr,
            self.vars,
            get_settings().pr_questions_prompt.system,
            get_settings().pr_questions_prompt.user,
        )
        self.patches_diff = None
        self.prediction = None

    def parse_args(self, args):
        """Join the given argument tokens with spaces; return "" when there are none."""
        if not args or not len(args) > 0:
            return ""
        return " ".join(args)

    async def run(self):
        """
        Produce the answer and, when publishing is enabled, post it on the PR.

        A temporary "Preparing answer..." comment is published first and removed after
        the final answer was published. Always returns "".
        """
        get_logger().info(f'Answering a PR question about the PR {self.pr_url} ')
        relevant_configs = {
            'pr_questions': dict(get_settings().pr_questions),
            'config': dict(get_settings().config),
        }
        get_logger().debug("Relevant configs", artifacts=relevant_configs)

        if get_settings().config.publish_output:
            self.git_provider.publish_comment("Preparing answer...", is_temporary=True)

        # identify image
        img_path = self.identify_image_in_comment()
        if img_path:
            get_logger().debug(f"Image path identified", artifact=img_path)

        await retry_with_fallback_models(self._prepare_prediction, model_type=ModelType.WEAK)

        pr_comment = self._prepare_pr_answer()
        get_logger().debug(f"PR output", artifact=pr_comment)

        # Optionally append the collapsible usage guide.
        if self.git_provider.is_supported("gfm_markdown") and get_settings().pr_questions.enable_help_text:
            pr_comment += "<hr>\n\n<details> <summary><strong>💡 Tool usage guide:</strong></summary><hr> \n\n"
            pr_comment += HelpMessage.get_ask_usage_guide()
            pr_comment += "\n</details>\n"

        if get_settings().config.publish_output:
            self.git_provider.publish_comment(pr_comment)
            self.git_provider.remove_initial_comment()
        return ""

    def identify_image_in_comment(self):
        """
        Extract an image path from the question, if one is present.

        Two forms are recognized: a markdown image tag ("![image](path)") and a direct
        "https://" link in a question that also contains ".png" or "jpg". When found,
        the path is also stored in ``self.vars['img_path']``.

        Returns:
            The image path, or '' when the question references no image.
        """
        question = self.question_str
        if '![image]' in question:
            # assuming structure:
            # /ask question ...  > ![image](img_path)
            img_path = question.split('![image]')[1].strip().strip('()')
        elif 'https://' in question and ('.png' in question or 'jpg' in question):
            # direct image link - keep the https:// prefix in the image path
            img_path = 'https://' + question.split('https://')[1]
        else:
            return ''

        self.vars['img_path'] = img_path
        return img_path

    async def _prepare_prediction(self, model: str):
        """Fetch the PR diff and store the model's answer ("" when no diff is available)."""
        self.patches_diff = get_pr_diff(self.git_provider, self.token_handler, model)
        if not self.patches_diff:
            get_logger().error(f"Error getting PR diff")
            self.prediction = ""
            return

        get_logger().debug(f"PR diff", artifact=self.patches_diff)
        self.prediction = await self._get_prediction(model)

    async def _get_prediction(self, model: str):
        """
        Render the question prompts and return the model's response text.

        Args:
            model: Model name passed to the AI handler.
        """
        variables = copy.deepcopy(self.vars)
        variables["diff"] = self.patches_diff  # update diff

        environment = Environment(undefined=StrictUndefined)
        system_prompt = environment.from_string(get_settings().pr_questions_prompt.system).render(variables)
        user_prompt = environment.from_string(get_settings().pr_questions_prompt.user).render(variables)

        # The image path is only forwarded when one was identified in the question.
        extra_kwargs = {}
        if 'img_path' in variables:
            extra_kwargs['img_path'] = self.vars['img_path']

        response, _finish_reason = await self.ai_handler.chat_completion(
            model=model,
            temperature=get_settings().config.temperature,
            system=system_prompt,
            user=user_prompt,
            **extra_kwargs,
        )
        return response

    def gitlab_protections(self, model_answer: str) -> str:
        """
        Replace the answer with an error text if it contains a quick-action command.

        Args:
            model_answer: The (already sanitized) model answer.

        Returns:
            ``model_answer`` unchanged when none of the listed commands occurs in it,
            otherwise the error message (which is also logged).
        """
        quick_actions = ("/approve", "/close", "/merge", "/reopen", "/unapprove",
                         "/title", "/assign", "/copy_metadata", "/target_branch")
        for action in quick_actions:
            if action in model_answer:
                str_err = "Model answer contains GitHub quick actions, which are not supported in GitLab"
                get_logger().error(str_err)
                return str_err
        return model_answer

    def _prepare_pr_answer(self) -> str:
        """
        Sanitize the stored prediction and wrap it into the markdown answer.

        Returns:
            Markdown with an "Ask" section (the question) and an "Answer" section.
        """
        model_answer = self.prediction.strip()

        # sanitize the answer so that no line will start with "/"
        sanitized = model_answer.replace("\n/", "\n /").replace("\r/", "\r /")
        if isinstance(self.git_provider, GitLabProvider):
            sanitized = self.gitlab_protections(sanitized)
        if sanitized.startswith("/"):
            sanitized = " " + sanitized

        if sanitized != model_answer:
            get_logger().debug(f"Sanitized model answer",
                               artifact={"model_answer": model_answer, "sanitized_answer": sanitized})

        question_part = f"### **Ask**❓\n{self.question_str}\n\n"
        answer_part = f"### **Answer:**\n{sanitized}\n\n"
        return question_part + answer_part
class PRReviewer:
    """
    The PRReviewer class is responsible for reviewing a pull request and generating feedback using an AI model.
    """

    def __init__(self, pr_url: str, is_answer: bool = False, is_auto: bool = False, args: list = None,
                 ai_handler: partial[BaseAiHandler,] = LiteLLMAIHandler):
        """
        Initialize the PRReviewer object with the necessary attributes and objects to review a pull request.

        Args:
            pr_url (str): The URL of the pull request to be reviewed.
            is_answer (bool, optional): Indicates whether the review is being done in answer mode. Defaults to False.
            is_auto (bool, optional): Indicates whether the review is being done in automatic mode. Defaults to False.
            args (list, optional): List of arguments passed to the PRReviewer class. Defaults to None.
            ai_handler: Zero-argument callable that builds the AI handler used for the review.
                Defaults to LiteLLMAIHandler.

        Raises:
            Exception: if answer mode is requested but the git provider cannot list issue comments.
        """
        self.git_provider = get_git_provider_with_context(pr_url)
        self.args = args
        self.incremental = self.parse_incremental(args)  # -i command
        if self.incremental and self.incremental.is_incremental:
            self.git_provider.get_incremental_commits(self.incremental)

        self.main_language = get_main_pr_language(
            self.git_provider.get_languages(), self.git_provider.get_files()
        )
        self.pr_url = pr_url
        self.is_answer = is_answer
        self.is_auto = is_auto

        if self.is_answer and not self.git_provider.is_supported("get_issue_comments"):
            raise Exception(f"Answer mode is not supported for {get_settings().config.git_provider} for now")
        self.ai_handler = ai_handler()
        self.ai_handler.main_pr_language = self.main_language
        self.patches_diff = None
        self.prediction = None

        # _get_user_answers() returns (question, answer) - unpack in that same order
        question_str, answer_str = self._get_user_answers()

        self.pr_description, self.pr_description_files = (
            self.git_provider.get_pr_description(split_changes_walkthrough=True))
        if (self.pr_description_files and get_settings().get("config.is_auto_command", False) and
                get_settings().get("config.enable_ai_metadata", False)):
            add_ai_metadata_to_diff_files(self.git_provider, self.pr_description_files)
            get_logger().debug("AI metadata added to the this command")
        else:
            get_settings().set("config.enable_ai_metadata", False)
            get_logger().debug("AI metadata is disabled for this command")

        self.vars = {
            "title": self.git_provider.pr.title,
            "branch": self.git_provider.get_pr_branch(),
            "description": self.pr_description,
            "language": self.main_language,
            "diff": "",  # empty diff for initial calculation
            "num_pr_files": self.git_provider.get_num_of_files(),
            "num_max_findings": get_settings().pr_reviewer.num_max_findings,
            "require_score": get_settings().pr_reviewer.require_score_review,
            "require_tests": get_settings().pr_reviewer.require_tests_review,
            "require_estimate_effort_to_review": get_settings().pr_reviewer.require_estimate_effort_to_review,
            "require_estimate_contribution_time_cost":
                get_settings().pr_reviewer.require_estimate_contribution_time_cost,
            'require_can_be_split_review': get_settings().pr_reviewer.require_can_be_split_review,
            'require_security_review': get_settings().pr_reviewer.require_security_review,
            'require_todo_scan': get_settings().pr_reviewer.get("require_todo_scan", False),
            'question_str': question_str,
            'answer_str': answer_str,
            "extra_instructions": get_settings().pr_reviewer.extra_instructions,
            "commit_messages_str": self.git_provider.get_commit_messages(),
            "custom_labels": "",
            "enable_custom_labels": get_settings().config.enable_custom_labels,
            "is_ai_metadata": get_settings().get("config.enable_ai_metadata", False),
            "related_tickets": get_settings().get('related_tickets', []),
            'duplicate_prompt_examples': get_settings().config.get('duplicate_prompt_examples', False),
            "date": datetime.datetime.now().strftime('%Y-%m-%d'),
        }

        self.token_handler = TokenHandler(
            self.git_provider.pr,
            self.vars,
            get_settings().pr_review_prompt.system,
            get_settings().pr_review_prompt.user
        )

    def parse_incremental(self, args: List[str]):
        """Return an IncrementalPR that is flagged incremental when the first argument is "-i"."""
        is_incremental = bool(args) and args[0] == "-i"
        return IncrementalPR(is_incremental)

    async def run(self) -> None:
        """
        Run the review flow: validate preconditions, obtain the prediction, format it and publish it.

        Any exception is logged (with traceback) and swallowed, so the caller never sees it.
        """
        try:
            if not self.git_provider.get_files():
                get_logger().info(f"PR has no files: {self.pr_url}, skipping review")
                return None

            if self.incremental.is_incremental and not self._can_run_incremental_review():
                return None

            # NOTE: the 'auto_approve' argument flow is currently disabled here; see auto_approve_logic().

            get_logger().info(f'Reviewing PR: {self.pr_url} ...')
            relevant_configs = {'pr_reviewer': dict(get_settings().pr_reviewer),
                                'config': dict(get_settings().config)}
            get_logger().debug("Relevant configs", artifacts=relevant_configs)

            # ticket extraction if exists
            await extract_and_cache_pr_tickets(self.git_provider, self.vars)

            if (self.incremental.is_incremental and hasattr(self.git_provider, "unreviewed_files_set")
                    and not self.git_provider.unreviewed_files_set):
                get_logger().info(f"Incremental review is enabled for {self.pr_url} but there are no new files")
                previous_review_url = ""
                if hasattr(self.git_provider, "previous_review"):
                    previous_review_url = self.git_provider.previous_review.html_url
                if get_settings().config.publish_output:
                    self.git_provider.publish_comment(f"Incremental Review Skipped\n"
                                                      f"No files were changed since the "
                                                      f"[previous PR Review]({previous_review_url})")
                return None

            if get_settings().config.publish_output and not get_settings().config.get('is_auto_command', False):
                self.git_provider.publish_comment("Preparing review...", is_temporary=True)

            await retry_with_fallback_models(self._prepare_prediction, model_type=ModelType.REGULAR)
            if not self.prediction:
                self.git_provider.remove_initial_comment()
                return None

            pr_review = self._prepare_pr_review()
            get_logger().debug("PR output", artifact=pr_review)

            should_publish = (get_settings().config.publish_output
                              and self._should_publish_review_no_suggestions(pr_review))
            if not should_publish:
                reason = "Review output is not published"
                if get_settings().config.publish_output:
                    reason += ": no major issues detected."
                    # a temporary "Preparing review..." comment may have been posted above; clean it up
                    self.git_provider.remove_initial_comment()
                get_logger().info(reason)
                get_settings().data = {"artifact": pr_review}
                return

            # publish the review
            if get_settings().pr_reviewer.persistent_comment and not self.incremental.is_incremental:
                final_update_message = get_settings().pr_reviewer.final_update_message
                self.git_provider.publish_persistent_comment(pr_review,
                                                             initial_header=f"{PRReviewHeader.REGULAR.value} 🔍",
                                                             update_header=True,
                                                             final_update_message=final_update_message, )
            else:
                self.git_provider.publish_comment(pr_review)

            self.git_provider.remove_initial_comment()
        except Exception as e:
            # .exception keeps the traceback, which the plain error message alone does not provide
            get_logger().exception(f"Failed to review PR: {e}")

    def _should_publish_review_no_suggestions(self, pr_review: str) -> bool:
        """Return False only when publishing empty reviews is disabled and the review reports no major issues."""
        return (get_settings().pr_reviewer.get('publish_output_no_suggestions', True)
                or "No major issues detected" not in pr_review)

    async def _prepare_prediction(self, model: str) -> None:
        """Build the PR diff for `model` and store the model's prediction (None when the diff is empty)."""
        self.patches_diff = get_pr_diff(self.git_provider,
                                        self.token_handler,
                                        model,
                                        add_line_numbers_to_hunks=True,
                                        disable_extra_lines=False,)

        if self.patches_diff:
            get_logger().debug("PR diff", diff=self.patches_diff)
            self.prediction = await self._get_prediction(model)
        else:
            get_logger().warning(f"Empty diff for PR: {self.pr_url}")
            self.prediction = None

    async def _get_prediction(self, model: str) -> str:
        """
        Generate an AI prediction for the pull request review.

        Args:
            model: A string representing the AI model to be used for the prediction.

        Returns:
            A string representing the AI prediction for the pull request review.
        """
        variables = copy.deepcopy(self.vars)
        variables["diff"] = self.patches_diff  # update diff

        environment = Environment(undefined=StrictUndefined)
        system_prompt = environment.from_string(get_settings().pr_review_prompt.system).render(variables)
        user_prompt = environment.from_string(get_settings().pr_review_prompt.user).render(variables)

        response, finish_reason = await self.ai_handler.chat_completion(
            model=model,
            temperature=get_settings().config.temperature,
            system=system_prompt,
            user=user_prompt
        )

        return response

    def _prepare_pr_review(self) -> str:
        """
        Prepare the PR review by processing the AI prediction and generating a markdown-formatted text that
        summarizes the feedback.

        Returns:
            The markdown text, or an empty string when the prediction could not be parsed into review data.
        """
        first_key = 'review'
        last_key = 'security_concerns'
        data = load_yaml(self.prediction.strip(),
                         keys_fix_yaml=["ticket_compliance_check", "estimated_effort_to_review_[1-5]:",
                                        "security_concerns:", "key_issues_to_review:",
                                        "relevant_file:", "relevant_line:", "suggestion:"],
                         first_key=first_key, last_key=last_key)
        github_action_output(data, 'review')

        if not isinstance(data, dict) or 'review' not in data:
            # no active exception here, so log a plain error rather than logger.exception()
            get_logger().error("Failed to parse review data", artifact={"data": data})
            return ""

        # move data['review'] 'key_issues_to_review' key to the end of the dictionary
        review = data['review']
        if isinstance(review, dict) and 'key_issues_to_review' in review:
            review['key_issues_to_review'] = review.pop('key_issues_to_review')

        incremental_review_markdown_text = None
        # Add incremental review section
        if self.incremental.is_incremental:
            last_commit_url = f"{self.git_provider.get_pr_url()}/commits/" \
                              f"{self.git_provider.incremental.first_new_commit_sha}"
            incremental_review_markdown_text = f"Starting from commit {last_commit_url}"

        markdown_text = convert_to_markdown_v2(data, self.git_provider.is_supported("gfm_markdown"),
                                               incremental_review_markdown_text,
                                               git_provider=self.git_provider,
                                               files=self.git_provider.get_diff_files())

        # Add help text if gfm_markdown is supported
        if self.git_provider.is_supported("gfm_markdown") and get_settings().pr_reviewer.enable_help_text:
            markdown_text += "<hr>\n\n<details> <summary><strong>💡 Tool usage guide:</strong></summary><hr> \n\n"
            markdown_text += HelpMessage.get_review_usage_guide()
            markdown_text += "\n</details>\n"

        # Output the relevant configurations if enabled
        if get_settings().get('config', {}).get('output_relevant_configurations', False):
            markdown_text += show_relevant_configurations(relevant_section='pr_reviewer')

        # Add custom labels from the review prediction (effort, security)
        self.set_review_labels(data)

        return markdown_text or ""

    def _get_user_answers(self) -> Tuple[str, str]:
        """
        Retrieves the question and answer strings from the discussion messages related to a pull request.

        Only used in answer mode; the comments are scanned newest-first and the scan stops once both
        a questions comment and an '/answer' comment were seen.

        Returns:
            A tuple ``(question_str, answer_str)``; both are empty strings outside answer mode.
        """
        question_str = ""
        answer_str = ""

        if self.is_answer:
            discussion_messages = self.git_provider.get_issue_comments()

            for message in discussion_messages.reversed:
                if "Questions to better understand the PR:" in message.body:
                    question_str = message.body
                elif '/answer' in message.body:
                    answer_str = message.body

                if answer_str and question_str:
                    break

        return question_str, answer_str

    def _get_previous_review_comment(self):
        """
        Get the previous review comment if it exists.

        Returns None when the provider has no `get_previous_review` or when the lookup fails.
        """
        try:
            if hasattr(self.git_provider, "get_previous_review"):
                return self.git_provider.get_previous_review(
                    full=not self.incremental.is_incremental,
                    incremental=self.incremental.is_incremental,
                )
        except Exception as e:
            get_logger().exception(f"Failed to get previous review comment, error: {e}")

    def _remove_previous_review_comment(self, comment):
        """
        Remove the previous review comment if it exists. Failures are logged and not re-raised.
        """
        try:
            if comment:
                self.git_provider.remove_comment(comment)
        except Exception as e:
            get_logger().exception(f"Failed to remove previous review comment, error: {e}")

    def _can_run_incremental_review(self) -> bool:
        """
        Checks if we can run incremental review according the various configurations and previous review.

        Returns:
            False when there are no new commits in auto mode, when the provider lacks incremental
            support, or when the commit-count / commit-age thresholds are not satisfied; True otherwise.
        """
        # checking if running is auto mode but there are no new commits
        if self.is_auto and not self.incremental.first_new_commit_sha:
            get_logger().info(f"Incremental review is enabled for {self.pr_url} but there are no new commits")
            return False

        if not hasattr(self.git_provider, "get_incremental_commits"):
            get_logger().info(f"Incremental review is not supported for {get_settings().config.git_provider}")
            return False

        # checking if there are enough commits to start the review
        num_new_commits = len(self.incremental.commits_range)
        num_commits_threshold = get_settings().pr_reviewer.minimal_commits_for_incremental_review
        not_enough_commits = num_new_commits < num_commits_threshold

        # checking if the commits are not too recent to start the review
        # NOTE(review): now() is naive local time - confirm the provider's commit dates are comparable
        recent_commits_threshold = datetime.datetime.now() - datetime.timedelta(
            minutes=get_settings().pr_reviewer.minimal_minutes_for_incremental_review
        )
        last_seen_commit_date = (
            self.incremental.last_seen_commit.commit.author.date if self.incremental.last_seen_commit else None
        )
        all_commits_too_recent = (
            last_seen_commit_date > recent_commits_threshold if self.incremental.last_seen_commit else False
        )

        # check all the thresholds or just one to start the review:
        # when every threshold is required, a single failed check (any) blocks the review
        condition = any if get_settings().pr_reviewer.require_all_thresholds_for_incremental_review else all
        if condition((not_enough_commits, all_commits_too_recent)):
            get_logger().info(
                f"Incremental review is enabled for {self.pr_url} but didn't pass the threshold check to run:"
                f"\n* Number of new commits = {num_new_commits} (threshold is {num_commits_threshold})"
                f"\n* Last seen commit date = {last_seen_commit_date} (threshold is {recent_commits_threshold})"
            )
            return False
        return True

    def set_review_labels(self, data):
        """
        Publish 'Review effort N/5' and 'Possible security concern' labels derived from the review data.

        Existing labels of those two kinds are replaced; all other labels are kept. Nothing is done
        when output publishing is disabled. Any failure is logged and not re-raised.

        Args:
            data: parsed review prediction; the values are read from ``data['review']``.
        """
        if not get_settings().config.publish_output:
            return

        if not get_settings().pr_reviewer.require_estimate_effort_to_review:
            get_settings().pr_reviewer.enable_review_labels_effort = False  # we did not generate this output
        if not get_settings().pr_reviewer.require_security_review:
            get_settings().pr_reviewer.enable_review_labels_security = False  # we did not generate this output

        if not (get_settings().pr_reviewer.enable_review_labels_security or
                get_settings().pr_reviewer.enable_review_labels_effort):
            return

        try:
            review_labels = []
            if get_settings().pr_reviewer.enable_review_labels_effort:
                estimated_effort = data['review']['estimated_effort_to_review_[1-5]']
                estimated_effort_number = 0
                if isinstance(estimated_effort, str):
                    try:
                        estimated_effort_number = int(estimated_effort.split(',')[0])
                    except ValueError:
                        get_logger().warning(f"Invalid estimated_effort value: {estimated_effort}")
                elif isinstance(estimated_effort, int):
                    estimated_effort_number = estimated_effort
                else:
                    get_logger().warning(f"Unexpected type for estimated_effort: {type(estimated_effort)}")
                if 1 <= estimated_effort_number <= 5:  # 1, because ...
                    review_labels.append(f'Review effort {estimated_effort_number}/5')
            if (get_settings().pr_reviewer.enable_review_labels_security
                    and get_settings().pr_reviewer.require_security_review):
                security_concerns = data['review']['security_concerns']  # yes, because ...
                # YAML may deliver a bool (e.g. a bare `true`/`yes`) instead of text, so normalize first
                security_text = str(security_concerns).lower()
                if 'yes' in security_text or 'true' in security_text:
                    review_labels.append('Possible security concern')

            current_labels = self.git_provider.get_pr_labels(update=True)
            if not current_labels:
                current_labels = []
            get_logger().debug(f"Current labels:\n{current_labels}")
            managed_prefixes = ('review effort', 'possible security concern')
            current_labels_filtered = [label for label in current_labels
                                       if not label.lower().startswith(managed_prefixes)]
            new_labels = review_labels + current_labels_filtered
            if (current_labels or review_labels) and sorted(new_labels) != sorted(current_labels):
                get_logger().info(f"Setting review labels:\n{review_labels + current_labels_filtered}")
                self.git_provider.publish_labels(new_labels)
            else:
                get_logger().info(f"Review labels are already set:\n{review_labels + current_labels_filtered}")
        except Exception as e:
            get_logger().error(f"Failed to set review labels, error: {e}")

    def auto_approve_logic(self):
        """
        Auto-approve a pull request if it meets the conditions for auto-approval.

        When auto-approval is disabled in the configuration, a comment saying so is published instead.
        """
        if get_settings().config.enable_auto_approval:
            is_auto_approved = self.git_provider.auto_approve()
            if is_auto_approved:
                get_logger().info("Auto-approved PR")
                self.git_provider.publish_comment("Auto-approved PR")
        else:
            get_logger().info("Auto-approval option is disabled")
            self.git_provider.publish_comment("Auto-approval option for PR-Agent is disabled. "
                                              "You can enable it via a [configuration file](https://github.com/Codium-ai/pr-agent/blob/main/docs/REVIEW.md#auto-approval-1)")
original_issue_number = self.git_provider._parse_issue_url(self.issue_url.split('=')[-1]) issue_main = self.git_provider.repo_obj.get_issue(original_issue_number) issue_main.create_comment("Please set pinecone api key and environment in secrets file") raise Exception("Please set pinecone api key and environment in secrets file") # check if index exists, and if repo is already indexed run_from_scratch = False if run_from_scratch: # for debugging pinecone.init(api_key=api_key, environment=environment) if index_name in pinecone.list_indexes(): get_logger().info('Removing index...') pinecone.delete_index(index_name) get_logger().info('Done') upsert = True pinecone.init(api_key=api_key, environment=environment) if not index_name in pinecone.list_indexes(): run_from_scratch = True upsert = False else: if get_settings().pr_similar_issue.force_update_dataset: upsert = True else: pinecone_index = pinecone.Index(index_name=index_name) res = pinecone_index.fetch([f"example_issue_{repo_name_for_index}"]).to_dict() if res["vectors"]: upsert = False if run_from_scratch or upsert: # index the entire repo get_logger().info('Indexing the entire repo...') get_logger().info('Getting issues...') issues = list(repo_obj.get_issues(state='all')) get_logger().info('Done') self._update_index_with_issues(issues, repo_name_for_index, upsert=upsert) else: # update index if needed pinecone_index = pinecone.Index(index_name=index_name) issues_to_update = [] issues_paginated_list = repo_obj.get_issues(state='all') counter = 1 for issue in issues_paginated_list: if issue.pull_request: continue issue_str, comments, number = self._process_issue(issue) issue_key = f"issue_{number}" id = issue_key + "." 
+ "issue" res = pinecone_index.fetch([id]).to_dict() is_new_issue = True for vector in res["vectors"].values(): if vector['metadata']['repo'] == repo_name_for_index: is_new_issue = False break if is_new_issue: counter += 1 issues_to_update.append(issue) else: break if issues_to_update: get_logger().info(f'Updating index with {counter} new issues...') self._update_index_with_issues(issues_to_update, repo_name_for_index, upsert=True) else: get_logger().info('No new issues to update') elif get_settings().pr_similar_issue.vectordb == "lancedb": try: import lancedb # import lancedb only if needed except: raise Exception("Please install lancedb to use lancedb as vectordb") self.db = lancedb.connect(get_settings().lancedb.uri) self.table = None run_from_scratch = False if run_from_scratch: # for debugging if index_name in self.db.table_names(): get_logger().info('Removing Table...') self.db.drop_table(index_name) get_logger().info('Done') ingest = True if index_name not in self.db.table_names(): run_from_scratch = True ingest = False else: if get_settings().pr_similar_issue.force_update_dataset: ingest = True else: self.table = self.db[index_name] res = self.table.search().limit(len(self.table)).where(f"id='example_issue_{repo_name_for_index}'").to_list() get_logger().info("result: ", res) if res[0].get("vector"): ingest = False if run_from_scratch or ingest: # indexing the entire repo get_logger().info('Indexing the entire repo...') get_logger().info('Getting issues...') issues = list(repo_obj.get_issues(state='all')) get_logger().info('Done') self._update_table_with_issues(issues, repo_name_for_index, ingest=ingest) else: # update table if needed issues_to_update = [] issues_paginated_list = repo_obj.get_issues(state='all') counter = 1 for issue in issues_paginated_list: if issue.pull_request: continue issue_str, comments, number = self._process_issue(issue) issue_key = f"issue_{number}" issue_id = issue_key + "." 
+ "issue" res = self.table.search().limit(len(self.table)).where(f"id='{issue_id}'").to_list() is_new_issue = True for r in res: if r['metadata']['repo'] == repo_name_for_index: is_new_issue = False break if is_new_issue: counter += 1 issues_to_update.append(issue) else: break if issues_to_update: get_logger().info(f'Updating index with {counter} new issues...') self._update_table_with_issues(issues_to_update, repo_name_for_index, ingest=True) else: get_logger().info('No new issues to update') elif get_settings().pr_similar_issue.vectordb == "qdrant": try: import qdrant_client from qdrant_client.models import (Distance, FieldCondition, Filter, MatchValue, PointStruct, VectorParams) except Exception: raise Exception("Please install qdrant-client to use qdrant as vectordb") api_key = None url = None try: api_key = get_settings().qdrant.api_key url = get_settings().qdrant.url except Exception: if not self.cli_mode: repo_name, original_issue_number = self.git_provider._parse_issue_url(self.issue_url.split('=')[-1]) issue_main = self.git_provider.repo_obj.get_issue(original_issue_number) issue_main.create_comment("Please set qdrant url and api key in secrets file") raise Exception("Please set qdrant url and api key in secrets file") self.qdrant = qdrant_client.QdrantClient(url=url, api_key=api_key) run_from_scratch = False ingest = True if not self.qdrant.collection_exists(collection_name=self.index_name): run_from_scratch = True ingest = False self.qdrant.create_collection( collection_name=self.index_name, vectors_config=VectorParams(size=1536, distance=Distance.COSINE), ) else: if get_settings().pr_similar_issue.force_update_dataset: ingest = True else: response = self.qdrant.count( collection_name=self.index_name, count_filter=Filter(must=[ FieldCondition(key="metadata.repo", match=MatchValue(value=repo_name_for_index)), FieldCondition(key="id", match=MatchValue(value=f"example_issue_{repo_name_for_index}")), ]), ) ingest = True if response.count == 0 else False if 
async def run(self):
    """
    Find issues similar to the one at ``self.issue_url`` and publish them as a comment.

    The issue text is embedded with the OpenAI embedding model, the configured vector DB
    (pinecone / lancedb / qdrant) is queried for neighbours of the same repo, and a markdown list
    of the matching issues (or matching comments) is logged and, if publishing is enabled, posted.

    Returns:
        "" when the tool is not supported for the configured git provider, otherwise None.
    """
    if not self.supported:
        message = "The /similar_issue tool is currently supported only for GitHub."
        if get_settings().config.publish_output:
            try:
                from pr_agent.git_providers import get_git_provider_with_context
                provider = get_git_provider_with_context(self.issue_url)
                provider.publish_comment(message)
            except Exception as e:
                get_logger().warning(
                    "Failed to publish /similar_issue unsupported message",
                    artifact={"error": str(e)},
                )
        return ""

    get_logger().info('Getting issue...')
    repo_name, original_issue_number = self.git_provider._parse_issue_url(self.issue_url.split('=')[-1])
    issue_main = self.git_provider.repo_obj.get_issue(original_issue_number)
    issue_str, comments, number = self._process_issue(issue_main)
    openai.api_key = get_settings().openai.key
    get_logger().info('Done')

    get_logger().info('Querying...')
    res = openai.Embedding.create(input=[issue_str], engine=MODEL)
    embeds = [record['embedding'] for record in res['data']]

    relevant_issues_number_list = []
    relevant_comment_number_list = []
    score_list = []

    def record_match(match_id, score):
        """Register one vector-DB hit; the three result lists always grow together, one entry per issue."""
        # skip example issue
        if 'example_issue_' in match_id:
            return
        try:
            issue_number = int(match_id.split('.')[0].split('_')[-1])
        except ValueError:
            get_logger().debug(f"Failed to parse issue number from {match_id}")
            return
        if issue_number == original_issue_number or issue_number in relevant_issues_number_list:
            return

        comment_number = -1  # -1 means the hit is the issue itself, not one of its comments
        if 'comment' in match_id:
            try:
                comment_number = int(match_id.split('.')[1].split('_')[-1])
            except (ValueError, IndexError):
                comment_number = -1

        relevant_issues_number_list.append(issue_number)
        relevant_comment_number_list.append(comment_number)
        score_list.append("{:.2f}".format(score))

    vectordb = get_settings().pr_similar_issue.vectordb
    if vectordb == "pinecone":
        import pinecone  # only imported locally in __init__, so it must be imported here as well
        pinecone_index = pinecone.Index(index_name=self.index_name)
        res = pinecone_index.query(embeds[0],
                                   top_k=5,
                                   filter={"repo": self.repo_name_for_index},
                                   include_metadata=True).to_dict()
        for r in res['matches']:
            record_match(r["id"], r['score'])
        get_logger().info('Done')

    elif vectordb == "lancedb":
        res = self.table.search(embeds[0]).where(f"metadata.repo='{self.repo_name_for_index}'",
                                                 prefilter=True).to_list()
        for r in res:
            record_match(r["id"], 1 - r['_distance'])
        get_logger().info('Done')

    elif vectordb == "qdrant":
        from qdrant_client.models import FieldCondition, Filter, MatchValue
        res = self.qdrant.search(
            collection_name=self.index_name,
            query_vector=embeds[0],
            limit=5,
            query_filter=Filter(must=[FieldCondition(key="metadata.repo",
                                                     match=MatchValue(value=self.repo_name_for_index))]),
            with_payload=True,
        )
        for r in res:
            record_match(r.payload.get("id", ""), r.score)
        get_logger().info('Done')

    get_logger().info('Publishing response...')
    entries = []
    for i, issue_number_similar in enumerate(relevant_issues_number_list):
        issue = self.git_provider.repo_obj.get_issue(issue_number_similar)
        title = issue.title
        url = issue.html_url
        comment_number = relevant_comment_number_list[i]
        if comment_number != -1:
            # record ids are built as "comment_{j + 1}", i.e. 1-based positions in the comment list
            issue_comments = list(issue.get_comments())
            comment_index = comment_number - 1
            if 0 <= comment_index < len(issue_comments):
                url = issue_comments[comment_index].html_url
        entries.append(f"{i + 1}. **[{title}]({url})** (score={score_list[i]})\n\n")
    similar_issues_str = "### Similar Issues\n___\n\n" + "".join(entries)

    if get_settings().config.publish_output:
        issue_main.create_comment(similar_issues_str)
    get_logger().info(similar_issues_str)
    get_logger().info('Done')


def _process_issue(self, issue):
    """
    Flatten an issue into the text that gets embedded.

    Returns:
        tuple: ``(issue_str, comments, number)`` - the header/body text, the list of comments
        (empty when `pr_similar_issue.skip_comments` is set) and the issue number.
    """
    header = issue.title
    body = issue.body
    number = issue.number
    comments = [] if get_settings().pr_similar_issue.skip_comments else list(issue.get_comments())
    issue_str = f"Issue Header: \"{header}\"\n\nIssue Body:\n{body}"
    return issue_str, comments, number
+ "issue", text=issue_str, metadata=Metadata(repo=repo_name_for_index, username=username, created_at=created_at, level=IssueLevel.ISSUE) ) corpus.append(issue_record) if comments: for j, comment in enumerate(comments): comment_body = comment.body num_words_comment = len(comment_body.split()) if num_words_comment < 10 or not isinstance(comment_body, str): continue if len(comment_body) < 8000 or \ self.token_handler.count_tokens(comment_body) < MAX_TOKENS[MODEL]: comment_record = Record( id=issue_key + ".comment_" + str(j + 1), text=comment_body, metadata=Metadata(repo=repo_name_for_index, username=username, # use issue username for all comments created_at=created_at, level=IssueLevel.COMMENT) ) corpus.append(comment_record) df = pd.DataFrame(corpus.model_dump()["documents"]) get_logger().info('Done') get_logger().info('Embedding...') openai.api_key = get_settings().openai.key list_to_encode = list(df["text"].values) try: res = openai.Embedding.create(input=list_to_encode, engine=MODEL) embeds = [record['embedding'] for record in res['data']] except: embeds = [] get_logger().error('Failed to embed entire list, embedding one by one...') for i, text in enumerate(list_to_encode): try: res = openai.Embedding.create(input=[text], engine=MODEL) embeds.append(res['data'][0]['embedding']) except: embeds.append([0] * 1536) df["values"] = embeds meta = DatasetMetadata.empty() meta.dense_model.dimension = len(embeds[0]) ds = Dataset.from_pandas(df, meta) get_logger().info('Done') api_key = get_settings().pinecone.api_key environment = get_settings().pinecone.environment if not upsert: get_logger().info('Creating index from scratch...') ds.to_pinecone_index(self.index_name, api_key=api_key, environment=environment) time.sleep(15) # wait for pinecone to finalize indexing before querying else: get_logger().info('Upserting index...') namespace = "" batch_size: int = 100 concurrency: int = 10 pinecone.init(api_key=api_key, environment=environment) 
def _update_table_with_issues(self, issues_list, repo_name_for_index, ingest=False):
    """Embed issues (and their longer comments) and store them in the vector table.

    Args:
        issues_list: Iterable of issue objects; entries with a truthy
            ``pull_request`` attribute are skipped.
        repo_name_for_index: Repository name written into every record's metadata.
        ingest: When False, the table ``self.index_name`` is (re)created from
            the collected records. When True, the records are added to the
            already existing table referenced by ``self.table``.
    """
    get_logger().info('Processing issues...')
    corpus = Corpus()
    # Seed the corpus with a placeholder record for this repository.
    example_issue_record = Record(
        id=f"example_issue_{repo_name_for_index}",
        text="example_issue",
        metadata=Metadata(repo=repo_name_for_index)
    )
    corpus.append(example_issue_record)

    counter = 0
    for issue in issues_list:
        if issue.pull_request:
            continue

        counter += 1
        if counter % 100 == 0:
            get_logger().info(f"Scanned {counter} issues")
        if counter >= self.max_issues_to_scan:
            get_logger().info(f"Scanned {self.max_issues_to_scan} issues, stopping")
            break

        issue_str, comments, number = self._process_issue(issue)
        issue_key = f"issue_{number}"
        username = issue.user.login
        created_at = str(issue.created_at)
        # Cheap length test first; tokens are only counted for long texts.
        # NOTE(review): comments are indexed only when the issue itself is
        # accepted - nesting reconstructed from a flattened source, confirm.
        if len(issue_str) < 8000 or \
                self.token_handler.count_tokens(issue_str) < get_max_tokens(MODEL):
            issue_record = Record(
                id=issue_key + "." + "issue",
                text=issue_str,
                metadata=Metadata(repo=repo_name_for_index,
                                  username=username,
                                  created_at=created_at,
                                  level=IssueLevel.ISSUE)
            )
            corpus.append(issue_record)
            if comments:
                for j, comment in enumerate(comments):
                    comment_body = comment.body
                    # The type check must come first: a missing (None) body
                    # has no .split() and used to raise here.
                    if not isinstance(comment_body, str) or len(comment_body.split()) < 10:
                        continue

                    if len(comment_body) < 8000 or \
                            self.token_handler.count_tokens(comment_body) < MAX_TOKENS[MODEL]:
                        comment_record = Record(
                            id=issue_key + ".comment_" + str(j + 1),
                            text=comment_body,
                            metadata=Metadata(repo=repo_name_for_index,
                                              username=username,  # use issue username for all comments
                                              created_at=created_at,
                                              level=IssueLevel.COMMENT)
                        )
                        corpus.append(comment_record)
    df = pd.DataFrame(corpus.model_dump()["documents"])
    get_logger().info('Done')

    get_logger().info('Embedding...')
    openai.api_key = get_settings().openai.key
    list_to_encode = list(df["text"].values)
    try:
        res = openai.Embedding.create(input=list_to_encode, engine=MODEL)
        embeds = [record['embedding'] for record in res['data']]
    except Exception:  # not a bare except: Ctrl-C / SystemExit must still propagate
        embeds = []
        get_logger().error('Failed to embed entire list, embedding one by one...')
        for text in list_to_encode:
            try:
                res = openai.Embedding.create(input=[text], engine=MODEL)
                embeds.append(res['data'][0]['embedding'])
            except Exception:
                # Keep row alignment with a zero vector when a single text fails.
                embeds.append([0] * 1536)
    df["vector"] = embeds
    get_logger().info('Done')

    if not ingest:
        get_logger().info('Creating table from scratch...')
        self.table = self.db.create_table(self.index_name, data=df, mode="overwrite")
        time.sleep(15)
    else:
        get_logger().info('Ingesting in Table...')
        # Rows can only be added to a table that exists; the original check
        # was inverted and skipped ingestion exactly when the table was present.
        if self.index_name in self.db.table_names():
            self.table.add(df)
        else:
            get_logger().info(f"Table {self.index_name} doesn't exists!")
        time.sleep(5)
    get_logger().info('Done')
issues...') corpus = Corpus() example_issue_record = Record( id=f"example_issue_{repo_name_for_index}", text="example_issue", metadata=Metadata(repo=repo_name_for_index) ) corpus.append(example_issue_record) counter = 0 for issue in issues_list: if issue.pull_request: continue counter += 1 if counter % 100 == 0: get_logger().info(f"Scanned {counter} issues") if counter >= self.max_issues_to_scan: get_logger().info(f"Scanned {self.max_issues_to_scan} issues, stopping") break issue_str, comments, number = self._process_issue(issue) issue_key = f"issue_{number}" username = issue.user.login created_at = str(issue.created_at) if len(issue_str) < 8000 or \ self.token_handler.count_tokens(issue_str) < get_max_tokens(MODEL): issue_record = Record( id=issue_key + "." + "issue", text=issue_str, metadata=Metadata(repo=repo_name_for_index, username=username, created_at=created_at, level=IssueLevel.ISSUE) ) corpus.append(issue_record) if comments: for j, comment in enumerate(comments): comment_body = comment.body num_words_comment = len(comment_body.split()) if num_words_comment < 10 or not isinstance(comment_body, str): continue if len(comment_body) < 8000 or \ self.token_handler.count_tokens(comment_body) < MAX_TOKENS[MODEL]: comment_record = Record( id=issue_key + ".comment_" + str(j + 1), text=comment_body, metadata=Metadata(repo=repo_name_for_index, username=username, created_at=created_at, level=IssueLevel.COMMENT) ) corpus.append(comment_record) df = pd.DataFrame(corpus.model_dump()["documents"]) get_logger().info('Done') get_logger().info('Embedding...') openai.api_key = get_settings().openai.key list_to_encode = list(df["text"].values) try: res = openai.Embedding.create(input=list_to_encode, engine=MODEL) embeds = [record['embedding'] for record in res['data']] except Exception: embeds = [] get_logger().error('Failed to embed entire list, embedding one by one...') for i, text in enumerate(list_to_encode): try: res = openai.Embedding.create(input=[text], engine=MODEL) 
class PRUpdateChangelog:
    """Draft a CHANGELOG.md entry for a pull request and publish or commit it.

    The entry is generated by the configured AI handler from the PR diff, the
    PR metadata and the head of the current CHANGELOG.md. Depending on the
    ``pr_update_changelog.push_changelog_changes`` setting, the result is
    either posted as a PR comment or committed to the PR branch.
    """

    def __init__(self, pr_url: str, cli_mode=False, args=None, ai_handler: partial[BaseAiHandler,] = LiteLLMAIHandler):
        """Collect PR metadata and prepare the prompt variables.

        Args:
            pr_url: URL of the pull request to process.
            cli_mode: Stored on the instance as ``self.cli_mode``.
            args: Accepted for interface compatibility; not used here.
            ai_handler: Zero-argument factory returning the AI handler to use.
        """
        self.git_provider = get_git_provider()(pr_url)
        self.main_language = get_main_pr_language(
            self.git_provider.get_languages(), self.git_provider.get_files()
        )
        self.commit_changelog = get_settings().pr_update_changelog.push_changelog_changes
        self._get_changelog_file()  # sets self.changelog_file and self.changelog_file_str

        self.ai_handler = ai_handler()
        self.ai_handler.main_pr_language = self.main_language

        self.patches_diff = None
        self.prediction = None
        self.cli_mode = cli_mode
        self.vars = {
            "title": self.git_provider.pr.title,
            "branch": self.git_provider.get_pr_branch(),
            "description": self.git_provider.get_pr_description(),
            "language": self.main_language,
            "diff": "",  # empty diff for initial calculation
            "pr_link": "",
            "changelog_file_str": self.changelog_file_str,
            "today": date.today(),
            "extra_instructions": get_settings().pr_update_changelog.extra_instructions,
            "commit_messages_str": self.git_provider.get_commit_messages(),
        }
        self.token_handler = TokenHandler(self.git_provider.pr,
                                          self.vars,
                                          get_settings().pr_update_changelog_prompt.system,
                                          get_settings().pr_update_changelog_prompt.user)

    async def run(self):
        """Generate the changelog entry and publish or commit it.

        Returns early (after logging) when pushing is requested but the git
        provider has no ``create_or_update_pr_file``, or when no changelog
        text could be generated.
        """
        get_logger().info('Updating the changelog...')
        relevant_configs = {'pr_update_changelog': dict(get_settings().pr_update_changelog),
                            'config': dict(get_settings().config)}
        get_logger().debug("Relevant configs", artifacts=relevant_configs)

        # check if the git provider supports pushing changelog changes
        if get_settings().pr_update_changelog.push_changelog_changes and not hasattr(
            self.git_provider, "create_or_update_pr_file"
        ):
            get_logger().error(
                "Pushing changelog changes is not currently supported for this code platform"
            )
            if get_settings().config.publish_output:
                self.git_provider.publish_comment(
                    "Pushing changelog changes is not currently supported for this code platform"
                )
            return

        if get_settings().config.publish_output:
            self.git_provider.publish_comment("Preparing changelog updates...", is_temporary=True)

        await retry_with_fallback_models(self._prepare_prediction, model_type=ModelType.WEAK)

        if not (self.prediction or "").strip():
            # No diff or an empty model reply: publishing or committing would
            # only add blank content to the PR / CHANGELOG.md.
            get_logger().warning("Empty changelog prediction, skipping changelog update")
            if get_settings().config.publish_output:
                self.git_provider.remove_initial_comment()
            return

        new_file_content, answer = self._prepare_changelog_update()

        # Output the relevant configurations if enabled
        if get_settings().get('config', {}).get('output_relevant_configurations', False):
            answer += show_relevant_configurations(relevant_section='pr_update_changelog')

        get_logger().debug("PR output", artifact=answer)

        if get_settings().config.publish_output:
            self.git_provider.remove_initial_comment()
            if self.commit_changelog:
                self._push_changelog_update(new_file_content, answer)
            else:
                self.git_provider.publish_comment(f"**Changelog updates:** 🔄\n\n{answer}")

    async def _prepare_prediction(self, model: str):
        """Fetch the PR diff and store the model's answer in ``self.prediction``.

        ``self.prediction`` is set to an empty string when no diff is available.
        """
        self.patches_diff = get_pr_diff(self.git_provider, self.token_handler, model)
        if self.patches_diff:
            get_logger().debug("PR diff", artifact=self.patches_diff)
            self.prediction = await self._get_prediction(model)
        else:
            get_logger().error("Error getting PR diff")
            self.prediction = ""

    async def _get_prediction(self, model: str):
        """Render the prompts, query the AI handler and return the cleaned reply.

        Returns:
            The reply with surrounding whitespace, a leading code-fence line
            and surrounding backticks removed; ``""`` for an empty reply.
        """
        variables = copy.deepcopy(self.vars)
        variables["diff"] = self.patches_diff  # update diff
        if get_settings().pr_update_changelog.add_pr_link:
            variables["pr_link"] = self.git_provider.get_pr_url()

        environment = Environment(undefined=StrictUndefined)
        system_prompt = environment.from_string(get_settings().pr_update_changelog_prompt.system).render(variables)
        user_prompt = environment.from_string(get_settings().pr_update_changelog_prompt.user).render(variables)
        response, _finish_reason = await self.ai_handler.chat_completion(
            model=model, system=system_prompt, user=user_prompt, temperature=get_settings().config.temperature)

        # post-process the response; tolerate a handler that returns None
        response = (response or "").strip()
        if not response:
            return ""
        if response.startswith("```"):
            # Drop the opening fence line (e.g. "```markdown").
            response = "\n".join(response.splitlines()[1:])
        response = response.strip("`")
        return response

    def _prepare_changelog_update(self) -> Tuple[str, str]:
        """Build the new CHANGELOG.md content and the text shown to the user.

        Returns:
            ``(new_file_content, answer)``: the generated entry prepended to
            the existing changelog, and the entry itself (followed by a hint
            on how to commit it when pushing is disabled).
        """
        answer = (self.prediction or "").strip().strip("```").strip()  # noqa B005
        existing_content = getattr(self, "changelog_file", "")
        if existing_content:
            new_file_content = answer + "\n\n" + existing_content
        else:
            new_file_content = answer

        if not self.commit_changelog:
            answer += "\n\n\n>to commit the new content to the CHANGELOG.md file, please type:" \
                      "\n>'/update_changelog --pr_update_changelog.push_changelog_changes=true'\n"

        return new_file_content, answer

    def _push_changelog_update(self, new_file_content, answer):
        """Commit the updated CHANGELOG.md to the PR branch and annotate it.

        On GitHub a review comment is attached to the changed lines; if that
        fails, the update text is published as a regular comment instead.
        """
        if get_settings().pr_update_changelog.get("skip_ci_on_push", True):
            commit_message = "[skip ci] Update CHANGELOG.md"
        else:
            commit_message = "Update CHANGELOG.md"
        self.git_provider.create_or_update_pr_file(
            file_path="CHANGELOG.md",
            branch=self.git_provider.get_pr_branch(),
            contents=new_file_content,
            message=commit_message,
        )

        sleep(5)  # wait for the file to be updated
        try:
            if get_settings().config.git_provider == "github":
                last_commit_id = list(self.git_provider.pr.get_commits())[-1]
                review_comment = dict(
                    body="CHANGELOG.md update",
                    path="CHANGELOG.md",
                    line=max(2, len(answer.splitlines())),
                    start_line=1,
                )
                self.git_provider.pr.create_review(commit=last_commit_id, comments=[review_comment])
        except Exception:
            # we can't create a review for some reason, let's just publish a comment
            self.git_provider.publish_comment(f"**Changelog updates: 🔄**\n\n{answer}")

    def _get_default_changelog(self):
        """Return the example changelog layout used when CHANGELOG.md is empty."""
        # NOTE(review): line breaks reconstructed from a flattened source -
        # confirm against the original literal.
        example_changelog = (
            "\n"
            "Example:\n"
            "## <current_date>\n"
            "\n"
            "### Added\n"
            "...\n"
            "### Changed\n"
            "...\n"
            "### Fixed\n"
            "...\n"
        )
        return example_changelog

    def _get_changelog_file(self):
        """Load CHANGELOG.md from the PR branch.

        Sets ``self.changelog_file`` (full text) and ``self.changelog_file_str``
        (first ``CHANGELOG_LINES`` lines). Both become ``""`` when the file
        cannot be read; an existing but empty file falls back to the example
        layout for ``self.changelog_file_str``.
        """
        try:
            self.changelog_file = self.git_provider.get_pr_file_content(
                "CHANGELOG.md", self.git_provider.get_pr_branch()
            )

            if isinstance(self.changelog_file, bytes):
                self.changelog_file = self.changelog_file.decode('utf-8')

            changelog_file_lines = self.changelog_file.splitlines()[:CHANGELOG_LINES]
            self.changelog_file_str = "\n".join(changelog_file_lines)
        except Exception as e:
            get_logger().warning(f"Error getting changelog file: {e}")
            self.changelog_file_str = ""
            self.changelog_file = ""
            return

        if not self.changelog_file_str:
            self.changelog_file_str = self._get_default_changelog()
def find_jira_tickets(text):
    """Return the distinct JIRA ticket keys found in *text*.

    Keys are returned in a stable order: all matches of the plain
    ``PROJ-123`` pattern in order of appearance, followed by any additional
    keys found only by the URL-aware pattern. (The previous implementation
    returned ``list(set(...))``, whose order varied between runs.)

    Args:
        text: Text to scan; ``None`` or an empty string yields ``[]``.

    Returns:
        A list of unique ticket keys such as ``["PROJ-123", "ABC-9"]``.
    """
    if not text:
        return []

    # Regular expression patterns for JIRA tickets
    patterns = [
        r'\b[A-Z]{2,10}-\d{1,7}\b',  # Standard JIRA ticket format (e.g., PROJ-123)
        r'(?:https?://[^\s/]+/browse/)?([A-Z]{2,10}-\d{1,7})\b'  # JIRA URL or just the ticket
    ]

    # A dict keeps insertion order while de-duplicating.
    tickets = {}
    for pattern in patterns:
        # With zero or one capturing group, findall yields plain strings,
        # so no tuple handling is needed.
        for ticket in re.findall(pattern, text):
            if ticket:
                tickets.setdefault(ticket, None)
    return list(tickets)
async def extract_tickets(git_provider):
    """Collect the content of tickets linked to the pull request.

    For a ``GithubProvider``, ticket links are taken from the PR description
    and the branch name (at most three, de-duplicated, description first), and
    each issue is fetched together with its sub-issues and labels. For an
    ``AzureDevopsProvider``, the linked work items are used.

    Args:
        git_provider: The provider instance for the current pull request.

    Returns:
        A list of ticket dicts. The result is falsy (``None`` or an empty
        list) when no ticket is found, the provider type is not handled, or an
        unexpected error occurs (the error is logged, not raised).
    """
    MAX_TICKET_CHARACTERS = 10000
    try:
        if isinstance(git_provider, GithubProvider):
            user_description = git_provider.get_user_description()
            description_tickets = extract_ticket_links_from_pr_description(
                user_description, git_provider.repo, git_provider.base_url_html
            )
            branch_name = git_provider.get_pr_branch()
            branch_tickets = extract_ticket_links_from_branch_name(
                branch_name, git_provider.repo, git_provider.base_url_html
            )

            # Merge both sources, keeping first-seen order and dropping duplicates.
            seen = set()
            merged = []
            for link in description_tickets + branch_tickets:
                if link not in seen:
                    seen.add(link)
                    merged.append(link)
            if len(merged) > 3:
                get_logger().info(f"Too many tickets (description + branch): {len(merged)}")
                tickets = merged[:3]
            else:
                tickets = merged
            tickets_content = []

            if tickets:
                for ticket in tickets:
                    # NOTE(review): only the issue number is used; the issue is
                    # always looked up in git_provider.repo_obj even if the URL
                    # names another repository - confirm this is intended.
                    _, original_issue_number = git_provider._parse_issue_url(ticket)

                    try:
                        issue_main = git_provider.repo_obj.get_issue(original_issue_number)
                    except Exception as e:
                        get_logger().error(f"Error getting main issue: {e}",
                                           artifact={"traceback": traceback.format_exc()})
                        continue

                    issue_body_str = issue_main.body or ""
                    if len(issue_body_str) > MAX_TICKET_CHARACTERS:
                        issue_body_str = issue_body_str[:MAX_TICKET_CHARACTERS] + "..."

                    # Extract sub-issues (best effort: failures are logged and skipped)
                    sub_issues_content = []
                    try:
                        sub_issues = git_provider.fetch_sub_issues(ticket)
                        for sub_issue_url in sub_issues:
                            try:
                                _, sub_issue_number = git_provider._parse_issue_url(sub_issue_url)
                                sub_issue = git_provider.repo_obj.get_issue(sub_issue_number)

                                sub_body = sub_issue.body or ""
                                if len(sub_body) > MAX_TICKET_CHARACTERS:
                                    sub_body = sub_body[:MAX_TICKET_CHARACTERS] + "..."

                                sub_issues_content.append({
                                    'ticket_url': sub_issue_url,
                                    'title': sub_issue.title,
                                    'body': sub_body
                                })
                            except Exception as e:
                                get_logger().warning(f"Failed to fetch sub-issue content for {sub_issue_url}: {e}")

                    except Exception as e:
                        get_logger().warning(f"Failed to fetch sub-issues for {ticket}: {e}")

                    # Extract labels
                    labels = []
                    try:
                        for label in issue_main.labels:
                            labels.append(label.name if hasattr(label, 'name') else label)
                    except Exception as e:
                        get_logger().error(f"Error extracting labels error= {e}",
                                           artifact={"traceback": traceback.format_exc()})

                    tickets_content.append({
                        'ticket_id': issue_main.number,
                        'ticket_url': ticket,
                        'title': issue_main.title,
                        'body': issue_body_str,
                        'labels': ", ".join(labels),
                        'sub_issues': sub_issues_content  # Store sub-issues content
                    })

                # NOTE(review): return placement reconstructed from a flattened
                # source; with no tickets the function falls through and
                # returns None - confirm against the original layout.
                return tickets_content

        elif isinstance(git_provider, AzureDevopsProvider):
            tickets_info = git_provider.get_linked_work_items()
            tickets_content = []
            for ticket in tickets_info:
                try:
                    # `or` (not a .get default) so an explicit None value is
                    # handled the same way as in the GitHub branch.
                    ticket_body_str = ticket.get("body") or ""
                    if len(ticket_body_str) > MAX_TICKET_CHARACTERS:
                        ticket_body_str = ticket_body_str[:MAX_TICKET_CHARACTERS] + "..."

                    tickets_content.append(
                        {
                            "ticket_id": ticket.get("id"),
                            "ticket_url": ticket.get("url"),
                            "title": ticket.get("title"),
                            "body": ticket_body_str,
                            "requirements": ticket.get("acceptance_criteria", ""),
                            "labels": ", ".join(ticket.get("labels") or []),
                        }
                    )
                except Exception as e:
                    get_logger().error(
                        f"Error processing Azure DevOps ticket: {e}",
                        artifact={"traceback": traceback.format_exc()},
                    )
            return tickets_content

    except Exception as e:
        get_logger().error(f"Error extracting tickets error= {e}",
                           artifact={"traceback": traceback.format_exc()})
conventions" - title: "No Dead or Commented-Out Code" compliance_label: false objective: "Keep the codebase clean by ensuring all submitted code is active and necessary" success_criteria: "All code in the PR is active and serves a purpose; no commented-out blocks" failure_criteria: "Presence of unused, dead, or commented-out code sections" - title: "Robust Error Handling" compliance_label: false objective: "Ensure potential errors and edge cases are anticipated and handled gracefully throughout the code" success_criteria: "All error scenarios are properly caught and handled with appropriate responses" failure_criteria: "Unhandled exceptions, ignored errors, or missing edge case handling" - title: "Single Responsibility for Functions" compliance_label: false objective: "Each function should have a single, well-defined responsibility" success_criteria: "Functions perform one cohesive task with a single purpose" failure_criteria: "Functions that combine multiple unrelated operations or handle several distinct concerns" - title: "When relevant, utilize early return" compliance_label: false objective: "In a code snippet containing multiple logic conditions (such as 'if-else'), prefer an early return on edge cases than deep nesting" success_criteria: "When relevant, utilize early return that reduces nesting" failure_criteria: "Unjustified deep nesting that can be simplified by early return" ================================================ FILE: pyproject.toml ================================================ [build-system] requires = ["setuptools>=61.0", "wheel"] build-backend = "setuptools.build_meta" [project] name = "pr-agent" version = "0.3.1" authors = [{ name = "QodoAI", email = "ofir.f@qodo.ai" }] maintainers = [ { name = "Ofir Friedman", email = "ofir.f@qodo.ai" }, ] description = "QodoAI PR-Agent aims to help efficiently review and handle pull requests, by providing AI feedbacks and suggestions." 
readme = "README.md" requires-python = ">=3.12" keywords = ["AI", "Agents", "Pull Request", "Automation", "Code Review"] license = { file = "LICENSE" } classifiers = [ "Intended Audience :: Developers", "Programming Language :: Python :: 3", ] dynamic = ["dependencies"] [tool.setuptools.dynamic] dependencies = { file = ["requirements.txt"] } [project.urls] "Homepage" = "https://github.com/qodo-ai/pr-agent" "Documentation" = "https://qodo-merge-docs.qodo.ai/" [tool.setuptools] include-package-data = true [tool.setuptools.packages.find] where = ["."] include = [ "pr_agent*", ] # include pr_agent and any sub-packages it finds under it. [project.scripts] pr-agent = "pr_agent.cli:run" [tool.ruff] line-length = 120 lint.select = [ "E", # pycodestyle errors "F", # Pyflakes "B", # flake8-bugbear "I001", # isort basic checks "I002", # isort missing-required-import ] # First commit - only fixing isort lint.fixable = [ "I001", # isort basic checks ] lint.unfixable = [ "B", # Avoid trying to fix flake8-bugbear (`B`) violations. ] lint.exclude = ["api/code_completions"] lint.ignore = ["E999", "B008"] [tool.ruff.lint.per-file-ignores] "__init__.py" = [ "E402", ] # Ignore `E402` (module-level import not at top of file) in all `__init__.py` files.
[tool.bandit] exclude_dirs = ["tests"] skips = ["B101"] tests = [] [tool.pytest.ini_options] asyncio_mode = "auto" testpaths = ["tests"] python_files = ["test_*.py"] python_classes = ["Test*"] python_functions = ["test_*"] addopts = "--color=yes" console_output_style = "progress" ================================================ FILE: requirements-dev.txt ================================================ pytest==9.0.2 pytest-asyncio>=1.3.0 poetry twine pre-commit>=4,<5 ================================================ FILE: requirements.txt ================================================ aiohttp==3.12.15 anthropic>=0.69.0 #anthropic[vertex]==0.47.1 atlassian-python-api==3.41.4 azure-devops==7.1.0b4 azure-identity==1.25.0 boto3==1.40.45 certifi==2024.8.30 dynaconf==3.2.4 fastapi==0.118.0 GitPython==3.1.41 google-cloud-aiplatform==1.38.0 google-generativeai==0.8.3 google-cloud-storage==2.10.0 Jinja2==3.1.6 litellm==1.81.12 loguru==0.7.2 msrest==0.7.1 openai>=1.55.3 pytest==9.0.2 pytest-asyncio>=1.3.0 PyGithub==1.59.* PyJWT==2.10.1 PyYAML==6.0.1 python-gitlab==3.15.0 retry==0.9.2 starlette-context==0.3.6 tiktoken==0.8.0 ujson==5.8.0 uvicorn==0.22.0 tenacity==8.2.3 gunicorn==23.0.0 pytest-cov==7.0.0 pydantic==2.8.2 html2text==2024.2.26 giteapy==1.0.8 # Uncomment the following lines to enable the 'similar issue' tool # pinecone-client # pinecone-datasets @ git+https://github.com/mrT23/pinecone-datasets.git@main # lancedb==0.5.1 # qdrant-client==1.15.1 # uncomment this to support language LangChainOpenAIHandler # langchain==0.2.0 # langchain-core==0.2.28 # langchain-openai==0.1.20 ================================================ FILE: setup.py ================================================ # for compatibility with legacy tools # see: https://setuptools.pypa.io/en/latest/userguide/pyproject_config.html from setuptools import setup setup() ================================================ FILE: tests/e2e_tests/e2e_utils.py ================================================ 
FILE_PATH = "pr_agent/cli_pip.py" PR_HEADER_START_WITH = '### **User description**\nupdate cli_pip.py\n\n\n___\n\n### **PR Type**' REVIEW_START_WITH = '## PR Reviewer Guide 🔍\n\n<table>\n<tr><td>⏱️ <strong>Estimated effort to review</strong>:' IMPROVE_START_WITH_REGEX_PATTERN = r'^## PR Code Suggestions ✨\n\n<!-- [a-z0-9]+ -->\n\n<table><thead><tr><td>Category</td>' NUM_MINUTES = 5 NEW_FILE_CONTENT = """\ from pr_agent import cli from pr_agent.config_loader import get_settings def main(): # Fill in the following values provider = "github" # GitHub provider user_token = "..." # GitHub user token openai_key = "ghs_afsdfasdfsdf" # Example OpenAI key pr_url = "..." # PR URL, for example 'https://github.com/Codium-ai/pr-agent/pull/809' command = "/improve" # Command to run (e.g. '/review', '/describe', 'improve', '/ask="What is the purpose of this PR?"') # Setting the configurations get_settings().set("CONFIG.git_provider", provider) get_settings().set("openai.key", openai_key) get_settings().set("github.user_token", user_token) # Run the command. 
async def measure_performance(handler, num_requests=3):
    """Send *num_requests* chat completions concurrently and print timing stats.

    Args:
        handler: Object with an awaitable ``chat_completion(model=, system=,
            user=, temperature=)`` method.
        num_requests: Number of concurrent requests to issue. Zero is
            allowed and yields an empty result.

    Returns:
        The list of responses, in request order.
    """
    print(f'\nRunning performance test with {num_requests} requests...')
    # perf_counter is monotonic, so the interval cannot go negative or jump
    # when the system clock is adjusted.
    start_time = time.perf_counter()

    # Create multiple requests
    tasks = [
        handler.chat_completion(
            model='gpt-3.5-turbo',
            system='You are a helpful assistant',
            user=f'Test message {i}',
            temperature=0.2
        ) for i in range(num_requests)
    ]

    # Execute requests concurrently
    responses = await asyncio.gather(*tasks)

    total_time = time.perf_counter() - start_time
    # Guard both divisions: zero requests or a zero-length interval used to
    # raise ZeroDivisionError after the requests had already completed.
    avg_time = total_time / num_requests if num_requests else 0.0
    requests_per_second = num_requests / total_time if total_time > 0 else float('inf')

    print('Performance results:')
    print(f'Total time: {total_time:.2f} seconds')
    print(f'Average time per request: {avg_time:.2f} seconds')
    print(f'Requests per second: {requests_per_second:.2f}')

    return responses
await measure_performance(handler) except Exception as e: print('Error:', str(e)) print('Error type:', type(e)) print('Error details:', e.__dict__ if hasattr(e, '__dict__') else 'No additional details') if __name__ == '__main__': print('Environment variables:') print('OPENAI_API_KEY:', 'Set' if os.getenv('OPENAI_API_KEY') else 'Not set') print('OPENAI_API_TYPE:', os.getenv('OPENAI_API_TYPE', 'Not set')) print('OPENAI_API_BASE:', os.getenv('OPENAI_API_BASE', 'Not set')) asyncio.run(test()) ================================================ FILE: tests/e2e_tests/test_bitbucket_app.py ================================================ import hashlib import os import re import time from datetime import datetime import jwt import requests from atlassian.bitbucket import Cloud from requests.auth import HTTPBasicAuth from pr_agent.config_loader import get_settings from pr_agent.log import get_logger, setup_logger from tests.e2e_tests.e2e_utils import ( FILE_PATH, IMPROVE_START_WITH_REGEX_PATTERN, NEW_FILE_CONTENT, NUM_MINUTES, PR_HEADER_START_WITH, REVIEW_START_WITH, ) log_level = os.environ.get("LOG_LEVEL", "INFO") setup_logger(log_level) logger = get_logger() def test_e2e_run_bitbucket_app(): repo_slug = 'pr-agent-tests' project_key = 'codiumai' base_branch = "main" # or any base branch you want new_branch = f"bitbucket_app_e2e_test-{datetime.now().strftime('%Y-%m-%d-%H-%M-%S')}" get_settings().config.git_provider = "bitbucket" try: # Add username and password for authentication username = get_settings().get("BITBUCKET.USERNAME", None) password = get_settings().get("BITBUCKET.PASSWORD", None) s = requests.Session() s.auth = (username, password) # Use HTTP Basic Auth bitbucket_client = Cloud(session=s) repo = bitbucket_client.workspaces.get(workspace=project_key).repositories.get(repo_slug) # Create a new branch from the base branch logger.info(f"Creating a new branch {new_branch} from {base_branch}") source_branch = repo.branches.get(base_branch) target_repo = 
repo.branches.create(new_branch,source_branch.hash) # Update the file content url = f"https://api.bitbucket.org/2.0/repositories/{project_key}/{repo_slug}/src" files={FILE_PATH: NEW_FILE_CONTENT} data={ "message": "update cli_pip.py", "branch": new_branch, } requests.request("POST", url, auth=HTTPBasicAuth(username, password), data=data, files=files) # Create a pull request logger.info(f"Creating a pull request from {new_branch} to {base_branch}") pr = repo.pullrequests.create( title=f'{new_branch}', description="update cli_pip.py", source_branch=new_branch, destination_branch=base_branch ) # check every 1 minute, for 5 minutes if the PR has all the tool results for i in range(NUM_MINUTES): logger.info(f"Waiting for the PR to get all the tool results...") time.sleep(60) comments = list(pr.comments()) comments_raw = [c.raw for c in comments] if len(comments) >= 5: # header, 3 suggestions, 1 review valid_review = False for comment_raw in comments_raw: if comment_raw.startswith('## PR Reviewer Guide 🔍'): valid_review = True break if valid_review: break else: logger.error(f"REVIEW feedback is invalid") raise Exception("REVIEW feedback is invalid") else: logger.info(f"Waiting for the PR to get all the tool results. 
def test_e2e_run_gitea_app():
    """Run the Gitea app end-to-end against the ``codiumai/pr-agent-tests`` repo.

    Steps: create a branch from ``main``, commit ``NEW_FILE_CONTENT`` to
    ``FILE_PATH``, open a pull request, then poll its comments once a minute
    (at most ``NUM_MINUTES`` times) until at least five comments exist and one
    of them starts with the PR Reviewer Guide header. The pull request is
    closed and the branch deleted afterwards, on success and on failure.

    Raises:
        AssertionError: if ``GITEA.URL`` / ``GITEA.TOKEN`` are not configured,
            or the tool results did not show up within ``NUM_MINUTES`` polls.
        requests.RequestException: if a Gitea API call fails or times out.
        Exception: if five or more comments arrive but none is a review.
    """
    import base64  # kept function-local, as in the original

    repo_name = 'pr-agent-tests'
    owner = 'codiumai'
    base_branch = "main"
    new_branch = f"gitea_app_e2e_test-{datetime.now().strftime('%Y-%m-%d-%H-%M-%S')}"
    get_settings().config.git_provider = "gitea"

    request_timeout = 30  # seconds; without it a stalled server hangs the test forever
    gitea_url = None
    repo_api = None
    headers = None
    pr_number = None
    branch_requested = False

    try:
        gitea_url = get_settings().get("GITEA.URL", None)
        gitea_token = get_settings().get("GITEA.TOKEN", None)

        # Explicit raises instead of `assert False`: asserts vanish under `python -O`.
        if not gitea_url:
            logger.error("GITEA.URL is not set in the configuration")
            logger.info("Please set GITEA.URL in .env file or environment variables")
            raise AssertionError("GITEA.URL is not set in the configuration")

        if not gitea_token:
            logger.error("GITEA.TOKEN is not set in the configuration")
            logger.info("Please set GITEA.TOKEN in .env file or environment variables")
            raise AssertionError("GITEA.TOKEN is not set in the configuration")

        repo_api = f"{gitea_url}/api/v1/repos/{owner}/{repo_name}"
        headers = {
            'Authorization': f'token {gitea_token}',
            'Content-Type': 'application/json',
            'Accept': 'application/json'
        }

        logger.info(f"Creating a new branch {new_branch} from {base_branch}")
        response = requests.get(
            f"{repo_api}/branches/{base_branch}",
            headers=headers,
            timeout=request_timeout,
        )
        response.raise_for_status()
        base_commit_sha = response.json()['commit']['id']

        branch_data = {
            'ref': f"refs/heads/{new_branch}",
            'sha': base_commit_sha
        }
        # Flag is set before the call so a branch created by a request that
        # failed client-side (e.g. a timeout) is still cleaned up.
        branch_requested = True
        response = requests.post(
            f"{repo_api}/git/refs",
            headers=headers,
            json=branch_data,
            timeout=request_timeout,
        )
        response.raise_for_status()

        logger.info(f"Updating file {FILE_PATH} in branch {new_branch}")
        file_content_encoded = base64.b64encode(NEW_FILE_CONTENT.encode()).decode()

        # Update the file when it already exists on the branch, otherwise add it.
        try:
            response = requests.get(
                f"{repo_api}/contents/{FILE_PATH}?ref={new_branch}",
                headers=headers,
                timeout=request_timeout,
            )
            response.raise_for_status()
            file_sha = response.json().get('sha')
            file_data = {
                'message': 'Update cli_pip.py',
                'content': file_content_encoded,
                'sha': file_sha,
                'branch': new_branch
            }
        except (requests.RequestException, ValueError):
            # Lookup failed (typically 404) or returned non-JSON: create the file.
            file_data = {
                'message': 'Add cli_pip.py',
                'content': file_content_encoded,
                'branch': new_branch
            }

        response = requests.put(
            f"{repo_api}/contents/{FILE_PATH}",
            headers=headers,
            json=file_data,
            timeout=request_timeout,
        )
        response.raise_for_status()

        logger.info(f"Creating a pull request from {new_branch} to {base_branch}")
        pr_data = {
            'title': f'Test PR from {new_branch}',
            'body': 'update cli_pip.py',
            'head': new_branch,
            'base': base_branch
        }
        response = requests.post(
            f"{repo_api}/pulls",
            headers=headers,
            json=pr_data,
            timeout=request_timeout,
        )
        response.raise_for_status()
        pr_number = response.json()['number']

        for i in range(NUM_MINUTES):
            logger.info("Waiting for the PR to get all the tool results...")
            time.sleep(60)

            response = requests.get(
                f"{repo_api}/issues/{pr_number}/comments",
                headers=headers,
                timeout=request_timeout,
            )
            response.raise_for_status()
            comments = response.json()

            if len(comments) < 5:
                logger.info(f"Waiting for the PR to get all the tool results. {i + 1} minute(s) passed")
                continue

            if any(comment['body'].startswith('## PR Reviewer Guide 🔍') for comment in comments):
                break

            logger.error("REVIEW feedback is invalid")
            raise Exception("REVIEW feedback is invalid")
        else:
            raise AssertionError(f"After {NUM_MINUTES} minutes, the PR did not get all the tool results")

        # Strict cleanup on the success path: a failing cleanup fails the test.
        logger.info(f"Cleaning up: closing PR and deleting branch {new_branch}")
        response = requests.patch(
            f"{repo_api}/pulls/{pr_number}",
            headers=headers,
            json={'state': 'closed'},
            timeout=request_timeout,
        )
        response.raise_for_status()
        pr_number = None

        response = requests.delete(
            f"{repo_api}/git/refs/heads/{new_branch}",
            headers=headers,
            timeout=request_timeout,
        )
        response.raise_for_status()
        branch_requested = False

        logger.info("Succeeded in running e2e test for Gitea app on the PR")

    except Exception as e:
        logger.error(f"Failed to run e2e test for Gitea app: {e}")
        raise

    finally:
        # Best-effort cleanup of whatever is still left behind. There must be
        # no `return` in this block: returning from `finally` would discard
        # the exception being propagated and make a failed run look green.
        if headers is not None:
            try:
                if pr_number is not None:
                    requests.patch(
                        f"{repo_api}/pulls/{pr_number}",
                        headers=headers,
                        json={'state': 'closed'},
                        timeout=request_timeout,
                    )
                if branch_requested:
                    requests.delete(
                        f"{repo_api}/git/refs/heads/{new_branch}",
                        headers=headers,
                        timeout=request_timeout,
                    )
            except Exception as cleanup_error:
                logger.error(f"Failed to clean up after test: {cleanup_error}")
def test_e2e_run_github_app():
    """Run the GitHub app end-to-end on a freshly opened pull request.

    What we want to do:
    (1) open a PR in the repo 'https://github.com/Codium-ai/pr-agent-tests'
    (2) poll once a minute, up to ``NUM_MINUTES`` times, until the PR has been
        processed by the GitHub app
    (3) check that the relevant tools have been executed: the PR body must
        start with ``PR_HEADER_START_WITH``, the first issue comment with
        ``REVIEW_START_WITH``, and the second must match
        ``IMPROVE_START_WITH_REGEX_PATTERN``

    The test branch is deleted afterwards, on success and on failure.

    Raises:
        AssertionError: if a tool's feedback is invalid or the results did not
            arrive within ``NUM_MINUTES`` polls.
        Exception: any error raised by the GitHub client is logged and
            re-raised unchanged.
    """
    base_branch = "main"  # or any base branch you want
    new_branch = f"github_app_e2e_test-{datetime.now().strftime('%Y-%m-%d-%H-%M-%S')}"
    repo_url = 'Codium-ai/pr-agent-tests'
    get_settings().config.git_provider = "github"
    git_provider = get_git_provider()()
    github_client = git_provider.github_client
    repo = github_client.get_repo(repo_url)

    # Tracks whether there is a branch to remove, so a failure that happens
    # before the branch exists is not masked by a failing delete.
    branch_created = False

    try:
        # Create a new branch from the base branch
        source = repo.get_branch(base_branch)
        logger.info(f"Creating a new branch {new_branch} from {base_branch}")
        repo.create_git_ref(ref=f"refs/heads/{new_branch}", sha=source.commit.sha)
        branch_created = True

        # Get the file you want to edit
        file = repo.get_contents(FILE_PATH, ref=base_branch)

        # Update the file content
        logger.info(f"Updating the file {FILE_PATH}")
        commit_message = "update cli_pip.py"
        repo.update_file(
            file.path,
            commit_message,
            NEW_FILE_CONTENT,
            file.sha,
            branch=new_branch
        )

        # Create a pull request
        logger.info(f"Creating a pull request from {new_branch} to {base_branch}")
        pr = repo.create_pull(
            title=new_branch,
            body="update cli_pip.py",
            head=new_branch,
            base=base_branch
        )

        # Check every minute, for NUM_MINUTES minutes, whether the PR has all the tool results
        for i in range(NUM_MINUTES):
            logger.info("Waiting for the PR to get all the tool results...")
            time.sleep(60)
            logger.info(f"Checking the PR {pr.html_url} after {i + 1} minute(s)")
            pr.update()
            pr_header_body = pr.body
            comments = list(pr.get_issue_comments())
            if len(comments) != 2:
                logger.info(f"Waiting for the PR to get all the tool results. {i + 1} minute(s) passed")
                continue

            comments_body = [comment.body for comment in comments]
            assert pr_header_body.startswith(PR_HEADER_START_WITH), "DESCRIBE feedback is invalid"
            assert comments_body[0].startswith(REVIEW_START_WITH), "REVIEW feedback is invalid"
            assert re.match(IMPROVE_START_WITH_REGEX_PATTERN, comments_body[1]), "IMPROVE feedback is invalid"
            break
        else:
            raise AssertionError(f"After {NUM_MINUTES} minutes, the PR did not get all the tool results")

        # cleanup - delete the branch (strict here: a failing delete fails the test)
        logger.info(f"Deleting the branch {new_branch}")
        repo.get_git_ref(f"heads/{new_branch}").delete()
        branch_created = False

        # If we reach here, the test is successful
        logger.info(f"Succeeded in running e2e test for GitHub app on the PR {pr.html_url}")
    except Exception as e:
        logger.error(f"Failed to run e2e test for GitHub app: {e}")
        # Re-raise the original error so its message and traceback reach the
        # test report instead of a bare, message-less AssertionError.
        raise
    finally:
        if branch_created:
            # Best-effort: a cleanup problem must not hide the real failure.
            try:
                logger.info(f"Deleting the branch {new_branch}")
                repo.get_git_ref(f"heads/{new_branch}").delete()
            except Exception as cleanup_error:
                logger.error(f"Failed to delete the branch {new_branch}: {cleanup_error}")
project.branches.create({'branch': new_branch, 'ref': base_branch}) # Get the file you want to edit file = project.files.get(file_path=FILE_PATH, ref=base_branch) # content = file.decode() # Update the file content logger.info(f"Updating the file {FILE_PATH}") commit_message = "update cli_pip.py" file.content = NEW_FILE_CONTENT file.save(branch=new_branch, commit_message=commit_message) # Create a merge request logger.info(f"Creating a merge request from {new_branch} to {base_branch}") mr = project.mergerequests.create({ 'source_branch': new_branch, 'target_branch': base_branch, 'title': new_branch, 'description': "update cli_pip.py" }) logger.info(f"Merge request created: {mr.web_url}") # check every 1 minute, for 5, minutes if the PR has all the tool results for i in range(NUM_MINUTES): logger.info(f"Waiting for the MR to get all the tool results...") time.sleep(60) logger.info(f"Checking the MR {mr.web_url} after {i + 1} minute(s)") mr = project.mergerequests.get(mr.iid) mr_header_body = mr.description comments = mr.notes.list()[::-1] # clean all system comments comments = [comment for comment in comments if comment.system is False] if len(comments) == 2: # "changed the description" is received as the first comment comments_body = [comment.body for comment in comments] if 'Work in progress' in comments_body[1]: continue assert mr_header_body.startswith(PR_HEADER_START_WITH), "DESCRIBE feedback is invalid" assert comments_body[0].startswith(REVIEW_START_WITH), "REVIEW feedback is invalid" assert re.match(IMPROVE_START_WITH_REGEX_PATTERN, comments_body[1]), "IMPROVE feedback is invalid" break else: logger.info(f"Waiting for the MR to get all the tool results. 
async def run_async():
    """Run the 'describe', 'review' and 'improve' commands as a health check.

    Each command is run against the PR given by the ``TEST_PR_URL``
    environment variable (a fixed pr-agent PR by default) with publishing
    disabled. The generated text is read from the ``artifact`` entry of the
    settings data and checked for the expected heading prefix and section
    titles. The settings are snapshotted before each command and put back
    afterwards so that one command cannot influence the next.

    Raises:
        AssertionError: if a command's output does not look as expected.
        Exception: anything raised while handling a command is logged and
            re-raised unchanged.
    """
    pr_url = os.getenv('TEST_PR_URL', 'https://github.com/Codium-ai/pr-agent/pull/1385')

    get_settings().set("config.git_provider", "github")
    get_settings().set("config.publish_output", False)
    get_settings().set("config.fallback_models", [])

    # (command, required prefix, required substrings, success log message)
    sanity_checks = (
        ('describe', '###', ('PR Type', 'Description'), "PR description generated successfully\n"),
        ('review', '##', ('PR Reviewer Guide',), "PR review generated successfully\n"),
        ('improve', '##', ('PR Code Suggestions',), "PR improvements generated successfully\n"),
    )

    agent = PRAgent()
    try:
        for command, prefix, markers, success_message in sanity_checks:
            get_logger().info(f"\nSanity check for the '{command}' command...")
            settings_snapshot = copy.deepcopy(get_settings())
            await agent.handle_request(pr_url, [command])
            artifact = dict(get_settings().data)['artifact']

            # Explicit check rather than a bare `assert`: it survives
            # `python -O` and says which command produced bad output.
            if not artifact.startswith(prefix) or any(marker not in artifact for marker in markers):
                raise AssertionError(
                    f"Unexpected output of the '{command}' command: expected it to start with "
                    f"{prefix!r} and to contain {list(markers)}"
                )

            # Restore settings state after each test to prevent test interference
            context['settings'] = settings_snapshot
            get_logger().info(success_message)

        get_logger().info("\n\n========\nHealth test passed successfully\n========")

    except Exception:
        get_logger().exception("\n\n========\nHealth test failed\n========")
        raise


def run():
    """Run the health check inside a request-cycle context seeded with a copy of the global settings."""
    with request_cycle_context({}):
        context['settings'] = copy.deepcopy(global_settings)
        asyncio.run(run_async())
False, "open", False), ("opened", True, "open", False), ("opened", False, "closed", False), ], ) async def test_add_docs_trigger(monkeypatch, action, draft, state, should_run): # Mock settings to enable the "/add_docs" auto-command on PR opened settings = get_settings() settings.github_app.pr_commands = ["/add_docs"] settings.github_app.handle_pr_actions = ["opened"] # Define a FakeGitProvider for both apply_repo_settings and PRAddDocs class FakeGitProvider: def __init__(self, pr_url, *args, **kwargs): self.pr = type("pr", (), {"title": "Test PR"})() self.get_pr_branch = lambda: "test-branch" self.get_pr_description = lambda: "desc" self.get_languages = lambda: ["Python"] self.get_files = lambda: [] self.get_commit_messages = lambda: "msg" self.publish_comment = lambda *args, **kwargs: None self.remove_initial_comment = lambda: None self.publish_code_suggestions = lambda suggestions: True self.diff_files = [] self.get_repo_settings = lambda: {} # Patch Git provider lookups monkeypatch.setattr( "pr_agent.git_providers.utils.get_git_provider_with_context", lambda pr_url: FakeGitProvider(pr_url), ) monkeypatch.setattr( "pr_agent.tools.pr_add_docs.get_git_provider", lambda: FakeGitProvider, ) # Ensure identity provider always eligible monkeypatch.setattr( get_identity_provider().__class__, "verify_eligibility", lambda *args, **kwargs: Eligibility.ELIGIBLE, ) # Spy on PRAddDocs.run() ran = {"flag": False} async def fake_run(self): ran["flag"] = True monkeypatch.setattr(PRAddDocs, "run", fake_run) # Build minimal PR payload body = { "action": action, "pull_request": { "url": "https://example.com/fake/pr", "state": state, "draft": draft, }, } log_context = {} # Invoke the PR-open handler agent = PRAgent() await handle_new_pr_opened( body=body, event="pull_request", sender="tester", sender_id="123", action=action, log_context=log_context, agent=agent, ) assert ran["flag"] is should_run, ( f"Expected run() to be {'called' if should_run else 'skipped'}" f" for 
class TestAWSSecretsManagerProvider:
    """Unit tests for ``AWSSecretsManagerProvider`` running against a mocked boto3 client."""

    def _provider(self):
        """Build a provider whose settings and boto3 client are both mocked.

        Returns:
            A ``(provider, mock_client)`` pair, where ``mock_client`` is the
            ``MagicMock`` installed as ``provider.client``.
        """
        secret_arn = 'arn:aws:secretsmanager:us-east-1:123456789012:secret:test-secret'
        configured_values = {
            'aws_secrets_manager.secret_arn': secret_arn,
            'aws_secrets_manager.region_name': 'us-east-1',
            'aws.AWS_REGION_NAME': 'us-east-1'
        }

        def fake_get(k, d=None):
            # Stand-in for settings.get(key, default)
            return configured_values.get(k, d)

        fake_settings = MagicMock()
        fake_settings.get.side_effect = fake_get
        fake_settings.aws_secrets_manager.secret_arn = secret_arn

        boto_client = MagicMock()

        with patch('pr_agent.secret_providers.aws_secrets_manager_provider.get_settings') as mock_get_settings:
            with patch('pr_agent.secret_providers.aws_secrets_manager_provider.boto3.client') as mock_boto3_client:
                mock_get_settings.return_value = fake_settings
                mock_boto3_client.return_value = boto_client
                provider = AWSSecretsManagerProvider()

        # Set client directly for testing
        provider.client = boto_client
        return provider, boto_client

    # ---- Positive test cases ----

    def test_get_secret_success(self):
        """A stored secret string is returned as-is and looked up by its name."""
        provider, client = self._provider()
        client.get_secret_value.return_value = {'SecretString': 'test-secret-value'}

        assert provider.get_secret('test-secret-name') == 'test-secret-value'
        client.get_secret_value.assert_called_once_with(SecretId='test-secret-name')

    def test_get_all_secrets_success(self):
        """A JSON secret string is decoded into the original mapping."""
        provider, client = self._provider()
        expected = {'openai.key': 'sk-test', 'github.webhook_secret': 'webhook-secret'}
        client.get_secret_value.return_value = {'SecretString': json.dumps(expected)}

        assert provider.get_all_secrets() == expected

    # ---- Negative test cases (following Google Cloud Storage pattern) ----

    def test_get_secret_failure(self):
        """A client error while reading one secret yields an empty string."""
        provider, client = self._provider()
        client.get_secret_value.side_effect = Exception("AWS error")

        assert provider.get_secret('nonexistent-secret') == ""

    def test_get_all_secrets_failure(self):
        """A client error while reading all secrets yields an empty dictionary."""
        provider, client = self._provider()
        client.get_secret_value.side_effect = Exception("AWS error")

        assert provider.get_all_secrets() == {}

    def test_store_secret_update_existing(self):
        """Storing a secret writes the value through ``put_secret_value`` exactly once."""
        provider, client = self._provider()
        client.update_secret.return_value = {}

        provider.store_secret('test-secret', 'test-value')

        client.put_secret_value.assert_called_once_with(
            SecretId='test-secret',
            SecretString='test-value'
        )

    def test_init_failure_invalid_config(self):
        """Constructing the provider with no secret ARN configured raises."""
        broken_settings = MagicMock()
        broken_settings.aws_secrets_manager.secret_arn = None  # Configuration error

        with patch('pr_agent.secret_providers.aws_secrets_manager_provider.get_settings') as mock_get_settings:
            mock_get_settings.return_value = broken_settings
            with pytest.raises(Exception):
                AWSSecretsManagerProvider()

    def test_store_secret_failure(self):
        """A client error while storing a secret propagates to the caller."""
        provider, client = self._provider()
        client.put_secret_value.side_effect = Exception("AWS error")

        with pytest.raises(Exception):
            provider.store_secret('test-secret', 'test-value')
default_comment_status mock_settings = MagicMock() mock_settings.azure_devops.get.return_value = "closed" mock_settings.config.publish_output_progress = True mock_get_settings.return_value = mock_settings with patch.object(AzureDevopsProvider, "_get_azure_devops_client", return_value=(MagicMock(), MagicMock())): provider = AzureDevopsProvider() provider.workspace_slug = "ws" provider.repo_slug = "repo" provider.pr_num = 1 # Patch CommentThread and create_thread with patch("pr_agent.git_providers.azuredevops_provider.CommentThread") as MockThread: provider.azure_devops_client.create_thread.return_value.comments = [MagicMock()] provider.azure_devops_client.create_thread.return_value.comments[0].thread_id = 123 provider.azure_devops_client.create_thread.return_value.id = 123 provider.publish_comment("test comment") args, kwargs = MockThread.call_args assert kwargs.get("status") == "closed" @patch("pr_agent.git_providers.azuredevops_provider.get_settings") def test_publish_comment_active(self, mock_get_settings): # Simulate config with default_comment_status = "active" mock_settings = MagicMock() mock_settings.azure_devops.get.return_value = "active" mock_settings.config.publish_output_progress = True mock_get_settings.return_value = mock_settings with patch.object(AzureDevopsProvider, "_get_azure_devops_client", return_value=(MagicMock(), MagicMock())): provider = AzureDevopsProvider() provider.workspace_slug = "ws" provider.repo_slug = "repo" provider.pr_num = 1 # Patch CommentThread and create_thread with patch("pr_agent.git_providers.azuredevops_provider.CommentThread") as MockThread: provider.azure_devops_client.create_thread.return_value.comments = [MagicMock()] provider.azure_devops_client.create_thread.return_value.comments[0].thread_id = 123 provider.azure_devops_client.create_thread.return_value.id = 123 provider.publish_comment("test comment") args, kwargs = MockThread.call_args assert kwargs.get("status") == "active" def 
class TestBitbucketServerProvider:
    """Tests for ``BitbucketServerProvider``: PR URL parsing and 2-way diff computation."""

    # Pull request that every diff test is built around.
    _PR_URL = "https://git.onpreminstance.com/projects/AAA/repos/my-repo/pull-requests/1"

    def test_parse_pr_url(self):
        """A ``/projects/<key>/repos/...`` URL yields the project key as workspace."""
        parsed = BitbucketServerProvider._parse_pr_url(
            "https://git.onpreminstance.com/projects/AAA/repos/my-repo/pull-requests/1"
        )
        workspace_slug, repo_slug, pr_number = parsed
        assert workspace_slug == "AAA"
        assert repo_slug == "my-repo"
        assert pr_number == 1

    def test_parse_pr_url_with_users(self):
        """A ``/users/<name>/repos/...`` URL yields ``~<name>`` as workspace."""
        parsed = BitbucketServerProvider._parse_pr_url(
            "https://bitbucket.company-server.url/users/username/repos/my-repo/pull-requests/1"
        )
        workspace_slug, repo_slug, pr_number = parsed
        assert workspace_slug == "~username"
        assert repo_slug == "my-repo"
        assert pr_number == 1

    def mock_get_content_of_file(self, project_key, repository_slug, filename, at=None, markup=None):
        """Fake ``get_content_of_file``: return canned file content keyed by commit id ``at``.

        Unknown commit ids give an empty string; every other argument is ignored.
        """
        return {
            '9c1cffdd9f276074bfb6fb3b70fbee62d298b058': 'file\nwith\nsome\nlines\nto\nemulate\na\nreal\nfile\n',
            '2a1165446bdf991caf114d01f7c88d84ae7399cf': 'file\nwith\nmultiple \nlines\nto\nemulate\na\nfake\nfile\n',
            'f617708826cdd0b40abb5245eda71630192a17e3': 'file\nwith\nmultiple \nlines\nto\nemulate\na\nreal\nfile\n',
            'cb68a3027d6dda065a7692ebf2c90bed1bcdec28': 'file\nwith\nsome\nchanges\nto\nemulate\na\nreal\nfile\n',
            '1905dcf16c0aac6ac24f7ab617ad09c73dc1d23b': 'file\nwith\nsome\nlines\nto\nemulate\na\nfake\ntest\n',
            'ae4eca7f222c96d396927d48ab7538e2ee13ca63': 'readme\nwithout\nsome\nlines\nto\nsimulate\na\nreal\nfile',
            '548f8ba15abc30875a082156314426806c3f4d97': 'file\nwith\nsome\nlines\nto\nemulate\na\nreal\nfile',
            '0e898cb355a5170d8c8771b25d43fcaa1d2d9489': 'file\nwith\nmultiple\nlines\nto\nemulate\na\nreal\nfile'
        }.get(at, '')

    def mock_get_from_bitbucket_60(self, url):
        """Fake ``get`` for a server reporting version 6.0; unknown URLs give ''."""
        return {
            "rest/api/1.0/application-properties": {"version": "6.0"}
        }.get(url, '')

    def mock_get_from_bitbucket_70(self, url):
        """Fake ``get`` for a server reporting version 7.0; unknown URLs give ''."""
        return {
            "rest/api/1.0/application-properties": {"version": "7.0"}
        }.get(url, '')

    def mock_get_from_bitbucket_816(self, url):
        """Fake ``get`` for a server reporting version 8.16, which also serves the merge-base URL."""
        return {
            "rest/api/1.0/application-properties": {"version": "8.16"},
            "rest/api/latest/projects/AAA/repos/my-repo/pull-requests/1/merge-base": {
                'id': '548f8ba15abc30875a082156314426806c3f4d97'
            }
        }.get(url, '')

    def _make_client(self, pull_request, pr_commits, destination_commits, get_handler=None):
        """Build a ``Bitbucket`` mock describing a PR that modifies ``Readme.md``."""
        client = MagicMock(Bitbucket)
        client.get_pull_request.return_value = pull_request
        client.get_pull_requests_commits.return_value = pr_commits
        client.get_commits.return_value = destination_commits
        client.get_pull_requests_changes.return_value = [
            {
                'path': {'toString': 'Readme.md'},
                'type': 'MODIFY',
            }
        ]
        client.get_content_of_file.side_effect = self.mock_get_content_of_file
        if get_handler is not None:
            client.get.side_effect = get_handler
        return client

    def _diff_files(self, bitbucket_client):
        """Return ``get_diff_files()`` of a provider built on ``bitbucket_client``."""
        provider = BitbucketServerProvider(self._PR_URL, bitbucket_client=bitbucket_client)
        return provider.get_diff_files()

    @staticmethod
    def _modified_readme(base_content, head_content, patch_text):
        """Expected result: a single MODIFIED ``Readme.md`` entry."""
        return [
            FilePatchInfo(
                base_content,
                head_content,
                patch_text,
                'Readme.md',
                edit_type=EDIT_TYPE.MODIFIED,
            )
        ]

    def test_get_diff_files_simple_diverge_70(self):
        """
        tests the 2-way diff functionality where the diff should be between the HEAD of branch b and node c
        NOT between the HEAD of main and the HEAD of branch b

              - o  branch b
             /
        o - o - o  main
            ^ node c
        """
        bitbucket_client = self._make_client(
            pull_request={
                'toRef': {'latestCommit': '9c1cffdd9f276074bfb6fb3b70fbee62d298b058'},
                'fromRef': {'latestCommit': '2a1165446bdf991caf114d01f7c88d84ae7399cf'}
            },
            pr_commits=[
                {'id': '2a1165446bdf991caf114d01f7c88d84ae7399cf',
                 'parents': [{'id': 'f617708826cdd0b40abb5245eda71630192a17e3'}]}
            ],
            destination_commits=[
                {'id': '9c1cffdd9f276074bfb6fb3b70fbee62d298b058'},
                {'id': 'dbca09554567d2e4bee7f07993390153280ee450'}
            ],
            get_handler=self.mock_get_from_bitbucket_70,
        )

        expected = self._modified_readme(
            'file\nwith\nmultiple \nlines\nto\nemulate\na\nreal\nfile\n',
            'file\nwith\nmultiple \nlines\nto\nemulate\na\nfake\nfile\n',
            '--- \n+++ \n@@ -5,5 +5,5 @@\n to\n emulate\n a\n-real\n+fake\n file\n',
        )

        assert self._diff_files(bitbucket_client) == expected

    def test_get_diff_files_diverge_with_merge_commit_70(self):
        """
        tests the 2-way diff functionality where the diff should be between the HEAD of branch b and node c
        NOT between the HEAD of main and the HEAD of branch b

              - o - o - o  branch b
             /     /
        o - o -- o - o     main
                 ^ node c
        """
        bitbucket_client = self._make_client(
            pull_request={
                'toRef': {'latestCommit': 'cb68a3027d6dda065a7692ebf2c90bed1bcdec28'},
                'fromRef': {'latestCommit': '1905dcf16c0aac6ac24f7ab617ad09c73dc1d23b'}
            },
            pr_commits=[
                {'id': '1905dcf16c0aac6ac24f7ab617ad09c73dc1d23b',
                 'parents': [{'id': '692772f456c3db77a90b11ce39ea516f8c2bad93'}]},
                {'id': '692772f456c3db77a90b11ce39ea516f8c2bad93',
                 'parents': [
                     {'id': '2a1165446bdf991caf114d01f7c88d84ae7399cf'},
                     {'id': '9c1cffdd9f276074bfb6fb3b70fbee62d298b058'},
                 ]},
                {'id': '2a1165446bdf991caf114d01f7c88d84ae7399cf',
                 'parents': [{'id': 'f617708826cdd0b40abb5245eda71630192a17e3'}]}
            ],
            destination_commits=[
                {'id': 'cb68a3027d6dda065a7692ebf2c90bed1bcdec28'},
                {'id': '9c1cffdd9f276074bfb6fb3b70fbee62d298b058'},
                {'id': 'dbca09554567d2e4bee7f07993390153280ee450'}
            ],
            get_handler=self.mock_get_from_bitbucket_70,
        )

        expected = self._modified_readme(
            'file\nwith\nsome\nlines\nto\nemulate\na\nreal\nfile\n',
            'file\nwith\nsome\nlines\nto\nemulate\na\nfake\ntest\n',
            '--- \n+++ \n@@ -5,5 +5,5 @@\n to\n emulate\n a\n-real\n-file\n+fake\n+test\n',
        )

        assert self._diff_files(bitbucket_client) == expected

    def get_multi_merge_diverge_mock_client(self, api_version):
        """
        tests the 2-way diff functionality where the diff should be between the HEAD of branch c and node d
        NOT between the HEAD of main and the HEAD of branch c

                ---- o - o branch c
               /    /
              ---- o       branch b
             /    /
        o - o - o          main
                ^ node d

        ``api_version`` picks the fake ``get`` handler (60, 70 or 816); any
        other value leaves ``get`` as a plain mock.
        """
        get_handlers = {
            60: self.mock_get_from_bitbucket_60,
            70: self.mock_get_from_bitbucket_70,
            816: self.mock_get_from_bitbucket_816,
        }
        return self._make_client(
            pull_request={
                'toRef': {'latestCommit': '9569922b22fe4fd0968be6a50ed99f71efcd0504'},
                'fromRef': {'latestCommit': 'ae4eca7f222c96d396927d48ab7538e2ee13ca63'}
            },
            pr_commits=[
                {'id': 'ae4eca7f222c96d396927d48ab7538e2ee13ca63',
                 'parents': [{'id': 'bbf300fb3af5129af8c44659f8cc7a526a6a6f31'}]},
                {'id': 'bbf300fb3af5129af8c44659f8cc7a526a6a6f31',
                 'parents': [
                     {'id': '10b7b8e41cb370b48ceda8da4e7e6ad033182213'},
                     {'id': 'd1bb183c706a3ebe4c2b1158c25878201a27ad8c'},
                 ]},
                {'id': 'd1bb183c706a3ebe4c2b1158c25878201a27ad8c',
                 'parents': [
                     {'id': '5bd76251866cb415fc5ff232f63a581e89223bda'},
                     {'id': '548f8ba15abc30875a082156314426806c3f4d97'}
                 ]},
                {'id': '5bd76251866cb415fc5ff232f63a581e89223bda',
                 'parents': [{'id': '0e898cb355a5170d8c8771b25d43fcaa1d2d9489'}]},
                {'id': '10b7b8e41cb370b48ceda8da4e7e6ad033182213',
                 'parents': [{'id': '0e898cb355a5170d8c8771b25d43fcaa1d2d9489'}]}
            ],
            destination_commits=[
                {'id': '9569922b22fe4fd0968be6a50ed99f71efcd0504'},
                {'id': '548f8ba15abc30875a082156314426806c3f4d97'}
            ],
            get_handler=get_handlers.get(api_version),
        )

    def test_get_diff_files_multi_merge_diverge_60(self):
        """Multi-merge scenario against a server reporting version 6.0."""
        bitbucket_client = self.get_multi_merge_diverge_mock_client(60)

        expected = self._modified_readme(
            'file\nwith\nmultiple\nlines\nto\nemulate\na\nreal\nfile',
            'readme\nwithout\nsome\nlines\nto\nsimulate\na\nreal\nfile',
            '--- \n+++ \n@@ -1,9 +1,9 @@\n-file\n-with\n-multiple\n+readme\n+without\n+some\n lines\n to\n-emulate\n+simulate\n a\n real\n file\n',
        )

        assert self._diff_files(bitbucket_client) == expected

    def test_get_diff_files_multi_merge_diverge_70(self):
        """Multi-merge scenario against a server reporting version 7.0."""
        bitbucket_client = self.get_multi_merge_diverge_mock_client(70)

        expected = self._modified_readme(
            'file\nwith\nsome\nlines\nto\nemulate\na\nreal\nfile',
            'readme\nwithout\nsome\nlines\nto\nsimulate\na\nreal\nfile',
            '--- \n+++ \n@@ -1,9 +1,9 @@\n-file\n-with\n+readme\n+without\n some\n lines\n to\n-emulate\n+simulate\n a\n real\n file\n',
        )

        assert self._diff_files(bitbucket_client) == expected

    def test_get_diff_files_multi_merge_diverge_816(self):
        """Multi-merge scenario against a server reporting version 8.16."""
        bitbucket_client = self.get_multi_merge_diverge_mock_client(816)

        expected = self._modified_readme(
            'file\nwith\nsome\nlines\nto\nemulate\na\nreal\nfile',
            'readme\nwithout\nsome\nlines\nto\nsimulate\na\nreal\nfile',
            '--- \n+++ \n@@ -1,9 +1,9 @@\n-file\n-with\n+readme\n+without\n some\n lines\n to\n-emulate\n+simulate\n a\n real\n file\n',
        )

        assert self._diff_files(bitbucket_client) == expected
test_empty_input_text(self): """Test that empty input returns empty string.""" assert clip_tokens("", 10) == "" assert clip_tokens(None, 10) is None def test_text_under_token_limit(self): """Test that text under the token limit is returned unchanged.""" text = "Short text" max_tokens = 100 result = clip_tokens(text, max_tokens) assert result == text def test_text_exactly_at_token_limit(self): """Test text that is exactly at the token limit.""" text = "This is exactly at the limit" # Mock the token encoder to return exact limit with patch.object(TokenEncoder, 'get_token_encoder') as mock_encoder: mock_tokenizer = MagicMock() mock_tokenizer.encode.return_value = [1] * 10 # Exactly 10 tokens mock_encoder.return_value = mock_tokenizer result = clip_tokens(text, 10) assert result == text def test_text_over_token_limit_with_three_dots(self): """Test text over token limit with three dots addition.""" text = "This is a longer text that should be clipped when it exceeds the token limit" max_tokens = 5 with patch.object(TokenEncoder, 'get_token_encoder') as mock_encoder: mock_tokenizer = MagicMock() mock_tokenizer.encode.return_value = [1] * 20 # 20 tokens mock_encoder.return_value = mock_tokenizer result = clip_tokens(text, max_tokens) assert result.endswith("\n...(truncated)") assert len(result) < len(text) def test_text_over_token_limit_without_three_dots(self): """Test text over token limit without three dots addition.""" text = "This is a longer text that should be clipped" max_tokens = 5 with patch.object(TokenEncoder, 'get_token_encoder') as mock_encoder: mock_tokenizer = MagicMock() mock_tokenizer.encode.return_value = [1] * 20 # 20 tokens mock_encoder.return_value = mock_tokenizer result = clip_tokens(text, max_tokens, add_three_dots=False) assert not result.endswith("\n...(truncated)") assert len(result) < len(text) def test_negative_max_tokens(self): """Test that negative max_tokens returns empty string.""" text = "Some text" result = clip_tokens(text, -1) assert 
result == "" result = clip_tokens(text, -100) assert result == "" def test_zero_max_tokens(self): """Test that zero max_tokens returns empty string.""" text = "Some text" result = clip_tokens(text, 0) assert result == "" def test_delete_last_line_functionality(self): """Test the delete_last_line parameter functionality.""" text = "Line 1\nLine 2\nLine 3\nLine 4" max_tokens = 5 with patch.object(TokenEncoder, 'get_token_encoder') as mock_encoder: mock_tokenizer = MagicMock() mock_tokenizer.encode.return_value = [1] * 20 # 20 tokens mock_encoder.return_value = mock_tokenizer # Without delete_last_line result_normal = clip_tokens(text, max_tokens, delete_last_line=False) # With delete_last_line result_deleted = clip_tokens(text, max_tokens, delete_last_line=True) # The result with delete_last_line should be shorter or equal assert len(result_deleted) <= len(result_normal) def test_pre_computed_num_input_tokens(self): """Test using pre-computed num_input_tokens parameter.""" text = "This is a test text" max_tokens = 10 num_input_tokens = 15 # Should not call the encoder when num_input_tokens is provided with patch.object(TokenEncoder, 'get_token_encoder') as mock_encoder: mock_encoder.return_value = None # Should not be called result = clip_tokens(text, max_tokens, num_input_tokens=num_input_tokens) assert result.endswith("\n...(truncated)") mock_encoder.assert_not_called() def test_pre_computed_tokens_under_limit(self): """Test pre-computed tokens under the limit.""" text = "Short text" max_tokens = 20 num_input_tokens = 5 with patch.object(TokenEncoder, 'get_token_encoder') as mock_encoder: mock_encoder.return_value = None # Should not be called result = clip_tokens(text, max_tokens, num_input_tokens=num_input_tokens) assert result == text mock_encoder.assert_not_called() def test_special_characters_and_unicode(self): """Test text with special characters and Unicode content.""" text = "Special chars: @#$%^&*()_+ áéíóú 中문 🚀 emoji" max_tokens = 5 with 
patch.object(TokenEncoder, 'get_token_encoder') as mock_encoder: mock_tokenizer = MagicMock() mock_tokenizer.encode.return_value = [1] * 20 # 20 tokens mock_encoder.return_value = mock_tokenizer result = clip_tokens(text, max_tokens) assert isinstance(result, str) assert len(result) < len(text) def test_multiline_text_handling(self): """Test handling of multiline text.""" text = "Line 1\nLine 2\nLine 3\nLine 4\nLine 5" max_tokens = 5 with patch.object(TokenEncoder, 'get_token_encoder') as mock_encoder: mock_tokenizer = MagicMock() mock_tokenizer.encode.return_value = [1] * 20 # 20 tokens mock_encoder.return_value = mock_tokenizer result = clip_tokens(text, max_tokens) assert isinstance(result, str) def test_very_long_text(self): """Test with very long text.""" text = "A" * 10000 # Very long text max_tokens = 10 with patch.object(TokenEncoder, 'get_token_encoder') as mock_encoder: mock_tokenizer = MagicMock() mock_tokenizer.encode.return_value = [1] * 5000 # Many tokens mock_encoder.return_value = mock_tokenizer result = clip_tokens(text, max_tokens) assert len(result) < len(text) assert result.endswith("\n...(truncated)") def test_encoder_exception_handling(self): """Test handling of encoder exceptions.""" text = "Test text" max_tokens = 10 with patch.object(TokenEncoder, 'get_token_encoder') as mock_encoder: mock_encoder.side_effect = Exception("Encoder error") # Should return original text when encoder fails result = clip_tokens(text, max_tokens) assert result == text def test_zero_division_scenario(self): """Test scenario that could lead to division by zero.""" text = "Test" max_tokens = 10 with patch.object(TokenEncoder, 'get_token_encoder') as mock_encoder: mock_tokenizer = MagicMock() mock_tokenizer.encode.return_value = [] # Empty tokens (could cause division by zero) mock_encoder.return_value = mock_tokenizer result = clip_tokens(text, max_tokens) # Should handle gracefully and return original text assert result == text def test_various_edge_cases(self): 
"""Test various edge cases.""" # Single character assert clip_tokens("A", 1000) == "A" # Only whitespace text = " \n \t " with patch.object(TokenEncoder, 'get_token_encoder') as mock_encoder: mock_tokenizer = MagicMock() mock_tokenizer.encode.return_value = [1] * 10 mock_encoder.return_value = mock_tokenizer result = clip_tokens(text, 5) assert isinstance(result, str) # Text with only newlines text = "\n\n\n\n" with patch.object(TokenEncoder, 'get_token_encoder') as mock_encoder: mock_tokenizer = MagicMock() mock_tokenizer.encode.return_value = [1] * 10 mock_encoder.return_value = mock_tokenizer result = clip_tokens(text, 2, delete_last_line=True) assert isinstance(result, str) def test_parameter_combinations(self): """Test different parameter combinations.""" text = "Multi\nline\ntext\nfor\ntesting" max_tokens = 5 with patch.object(TokenEncoder, 'get_token_encoder') as mock_encoder: mock_tokenizer = MagicMock() mock_tokenizer.encode.return_value = [1] * 20 mock_encoder.return_value = mock_tokenizer # Test all combinations combinations = [ (True, True), # add_three_dots=True, delete_last_line=True (True, False), # add_three_dots=True, delete_last_line=False (False, True), # add_three_dots=False, delete_last_line=True (False, False), # add_three_dots=False, delete_last_line=False ] for add_dots, delete_line in combinations: result = clip_tokens(text, max_tokens, add_three_dots=add_dots, delete_last_line=delete_line) assert isinstance(result, str) if add_dots and len(result) > 0: assert result.endswith("\n...(truncated)") or result == text def test_num_output_chars_zero_scenario(self): """Test scenario where num_output_chars becomes zero or negative.""" text = "Short" max_tokens = 1 with patch.object(TokenEncoder, 'get_token_encoder') as mock_encoder: mock_tokenizer = MagicMock() mock_tokenizer.encode.return_value = [1] * 1000 # Many tokens for short text mock_encoder.return_value = mock_tokenizer result = clip_tokens(text, max_tokens) # When num_output_chars is 0 or 
negative, should return empty string assert result == "" def test_logging_on_exception(self): """Test that exceptions are properly logged.""" text = "Test text" max_tokens = 10 # Patch the logger at the module level where it's imported with patch('pr_agent.algo.utils.get_logger') as mock_logger: mock_log_instance = MagicMock() mock_logger.return_value = mock_log_instance with patch.object(TokenEncoder, 'get_token_encoder') as mock_encoder: mock_encoder.side_effect = Exception("Test exception") result = clip_tokens(text, max_tokens) # Should log the warning mock_log_instance.warning.assert_called_once() # Should return original text assert result == text def test_factor_safety_calculation(self): """Test that the 0.9 factor (10% reduction) works correctly.""" text = "Test text that should be reduced by 10 percent for safety" max_tokens = 10 with patch.object(TokenEncoder, 'get_token_encoder') as mock_encoder: mock_tokenizer = MagicMock() mock_tokenizer.encode.return_value = [1] * 20 # 20 tokens mock_encoder.return_value = mock_tokenizer result = clip_tokens(text, max_tokens) # The result should be shorter due to the 0.9 factor # Characters per token = len(text) / 20 # Expected chars = int(0.9 * (len(text) / 20) * 10) expected_chars = int(0.9 * (len(text) / 20) * 10) # Result should be around expected_chars length (plus truncation text) if result.endswith("\n...(truncated)"): actual_content = result[:-len("\n...(truncated)")] assert len(actual_content) <= expected_chars + 5 # Some tolerance # Test the original basic functionality to ensure backward compatibility def test_clip_original_functionality(self): """Test original functionality from the existing test.""" text = "line1\nline2\nline3\nline4\nline5\nline6" max_tokens = 25 result = clip_tokens(text, max_tokens) assert result == text max_tokens = 10 result = clip_tokens(text, max_tokens) expected_results = 'line1\nline2\nline3\n\n...(truncated)' assert result == expected_results 
================================================ FILE: tests/unittest/test_codecommit_client.py ================================================ from unittest.mock import MagicMock from pr_agent.git_providers.codecommit_client import CodeCommitClient class TestCodeCommitProvider: def test_get_differences(self): # Create a mock CodeCommitClient instance and codecommit_client member api = CodeCommitClient() api.boto_client = MagicMock() # Mock the response from the AWS client for get_differences method api.boto_client.get_paginator.return_value.paginate.return_value = [ { "differences": [ { "beforeBlob": { "path": "file1.py", "blobId": "291b15c3ab4219e43a5f4f9091e5a97ee9d7400b", }, "afterBlob": { "path": "file1.py", "blobId": "46ad86582da03cc34c804c24b17976571bca1eba", }, "changeType": "M", }, { "beforeBlob": {"path": "", "blobId": ""}, "afterBlob": { "path": "file2.py", "blobId": "2404c7874fcbd684d6779c1420072f088647fd79", }, "changeType": "A", }, { "beforeBlob": { "path": "file3.py", "blobId": "9af7989045ce40e9478ebb8089dfbadac19a9cde", }, "afterBlob": {"path": "", "blobId": ""}, "changeType": "D", }, { "beforeBlob": { "path": "file5.py", "blobId": "738e36eec120ef9d6393a149252698f49156d5b4", }, "afterBlob": { "path": "file6.py", "blobId": "faecdb85f7ba199df927a783b261378a1baeca85", }, "changeType": "R", }, ] } ] diffs = api.get_differences("my_test_repo", "commit1", "commit2") assert len(diffs) == 4 assert diffs[0].before_blob_path == "file1.py" assert diffs[0].before_blob_id == "291b15c3ab4219e43a5f4f9091e5a97ee9d7400b" assert diffs[0].after_blob_path == "file1.py" assert diffs[0].after_blob_id == "46ad86582da03cc34c804c24b17976571bca1eba" assert diffs[0].change_type == "M" assert diffs[1].before_blob_path == "" assert diffs[1].before_blob_id == "" assert diffs[1].after_blob_path == "file2.py" assert diffs[1].after_blob_id == "2404c7874fcbd684d6779c1420072f088647fd79" assert diffs[1].change_type == "A" assert diffs[2].before_blob_path == "file3.py" assert 
diffs[2].before_blob_id == "9af7989045ce40e9478ebb8089dfbadac19a9cde" assert diffs[2].after_blob_path == "" assert diffs[2].after_blob_id == "" assert diffs[2].change_type == "D" assert diffs[3].before_blob_path == "file5.py" assert diffs[3].before_blob_id == "738e36eec120ef9d6393a149252698f49156d5b4" assert diffs[3].after_blob_path == "file6.py" assert diffs[3].after_blob_id == "faecdb85f7ba199df927a783b261378a1baeca85" assert diffs[3].change_type == "R" def test_get_file(self): # Create a mock CodeCommitClient instance and codecommit_client member api = CodeCommitClient() api.boto_client = MagicMock() # Mock the response from the AWS client for get_pull_request method # def get_file(self, repo_name: str, file_path: str, sha_hash: str): api.boto_client.get_file.return_value = { "commitId": "6335d6d4496e8d50af559560997604bb03abc122", "blobId": "c172209495d7968a8fdad76469564fb708460bc1", "filePath": "requirements.txt", "fileSize": 65, "fileContent": b"boto3==1.28.25\ndynaconf==3.1.12\nfastapi==0.99.0\nPyGithub==1.59.*\n", } repo_name = "my_test_repo" file_path = "requirements.txt" sha_hash = "84114a356ece1e5b7637213c8e486fea7c254656" content = api.get_file(repo_name, file_path, sha_hash) assert len(content) == 65 assert content == b"boto3==1.28.25\ndynaconf==3.1.12\nfastapi==0.99.0\nPyGithub==1.59.*\n" assert content.decode("utf-8") == "boto3==1.28.25\ndynaconf==3.1.12\nfastapi==0.99.0\nPyGithub==1.59.*\n" def test_get_pr(self): # Create a mock CodeCommitClient instance and codecommit_client member api = CodeCommitClient() api.boto_client = MagicMock() # Mock the response from the AWS client for get_pull_request method api.boto_client.get_pull_request.return_value = { "pullRequest": { "pullRequestId": "321", "title": "My PR", "description": "My PR description", "pullRequestTargets": [ { "sourceCommit": "commit1", "sourceReference": "branch1", "destinationCommit": "commit2", "destinationReference": "branch2", "repositoryName": "my_test_repo", } ], } } pr = 
api.get_pr("my_test_repo", 321) assert pr.title == "My PR" assert pr.description == "My PR description" assert len(pr.targets) == 1 assert pr.targets[0].source_commit == "commit1" assert pr.targets[0].source_branch == "branch1" assert pr.targets[0].destination_commit == "commit2" assert pr.targets[0].destination_branch == "branch2" ================================================ FILE: tests/unittest/test_codecommit_provider.py ================================================ from unittest.mock import patch import pytest from pr_agent.algo.types import EDIT_TYPE, FilePatchInfo from pr_agent.git_providers.codecommit_provider import CodeCommitFile, CodeCommitProvider, PullRequestCCMimic class TestCodeCommitFile: # Test that a CodeCommitFile object is created successfully with valid parameters. # Generated by CodiumAI def test_valid_parameters(self): a_path = "path/to/file_a" a_blob_id = "12345" b_path = "path/to/file_b" b_blob_id = "67890" edit_type = EDIT_TYPE.ADDED file = CodeCommitFile(a_path, a_blob_id, b_path, b_blob_id, edit_type) assert file.a_path == a_path assert file.a_blob_id == a_blob_id assert file.b_path == b_path assert file.b_blob_id == b_blob_id assert file.edit_type == edit_type assert file.filename == b_path class TestCodeCommitProvider: def test_get_title(self): # Test that the get_title() function returns the PR title with patch.object(CodeCommitProvider, "__init__", lambda x, y: None): provider = CodeCommitProvider(None) provider.pr = PullRequestCCMimic("My Test PR Title", []) assert provider.get_title() == "My Test PR Title" def test_get_pr_id(self): # Test that the get_pr_id() function returns the correct ID with patch.object(CodeCommitProvider, "__init__", lambda x, y: None): provider = CodeCommitProvider(None) provider.repo_name = "my_test_repo" provider.pr_num = 321 assert provider.get_pr_id() == "my_test_repo/321" def test_parse_pr_url(self): # Test that the _parse_pr_url() function can extract the repo name and PR number from a CodeCommit 
URL url = "https://us-east-1.console.aws.amazon.com/codesuite/codecommit/repositories/my_test_repo/pull-requests/321" repo_name, pr_number = CodeCommitProvider._parse_pr_url(url) assert repo_name == "my_test_repo" assert pr_number == 321 def test_is_valid_codecommit_hostname(self): # Test the various AWS regions assert CodeCommitProvider._is_valid_codecommit_hostname("af-south-1.console.aws.amazon.com") assert CodeCommitProvider._is_valid_codecommit_hostname("ap-east-1.console.aws.amazon.com") assert CodeCommitProvider._is_valid_codecommit_hostname("ap-northeast-1.console.aws.amazon.com") assert CodeCommitProvider._is_valid_codecommit_hostname("ap-northeast-2.console.aws.amazon.com") assert CodeCommitProvider._is_valid_codecommit_hostname("ap-northeast-3.console.aws.amazon.com") assert CodeCommitProvider._is_valid_codecommit_hostname("ap-south-1.console.aws.amazon.com") assert CodeCommitProvider._is_valid_codecommit_hostname("ap-south-2.console.aws.amazon.com") assert CodeCommitProvider._is_valid_codecommit_hostname("ap-southeast-1.console.aws.amazon.com") assert CodeCommitProvider._is_valid_codecommit_hostname("ap-southeast-2.console.aws.amazon.com") assert CodeCommitProvider._is_valid_codecommit_hostname("ap-southeast-3.console.aws.amazon.com") assert CodeCommitProvider._is_valid_codecommit_hostname("ap-southeast-4.console.aws.amazon.com") assert CodeCommitProvider._is_valid_codecommit_hostname("ca-central-1.console.aws.amazon.com") assert CodeCommitProvider._is_valid_codecommit_hostname("eu-central-1.console.aws.amazon.com") assert CodeCommitProvider._is_valid_codecommit_hostname("eu-central-2.console.aws.amazon.com") assert CodeCommitProvider._is_valid_codecommit_hostname("eu-north-1.console.aws.amazon.com") assert CodeCommitProvider._is_valid_codecommit_hostname("eu-south-1.console.aws.amazon.com") assert CodeCommitProvider._is_valid_codecommit_hostname("eu-south-2.console.aws.amazon.com") assert 
CodeCommitProvider._is_valid_codecommit_hostname("eu-west-1.console.aws.amazon.com") assert CodeCommitProvider._is_valid_codecommit_hostname("eu-west-2.console.aws.amazon.com") assert CodeCommitProvider._is_valid_codecommit_hostname("eu-west-3.console.aws.amazon.com") assert CodeCommitProvider._is_valid_codecommit_hostname("il-central-1.console.aws.amazon.com") assert CodeCommitProvider._is_valid_codecommit_hostname("me-central-1.console.aws.amazon.com") assert CodeCommitProvider._is_valid_codecommit_hostname("me-south-1.console.aws.amazon.com") assert CodeCommitProvider._is_valid_codecommit_hostname("sa-east-1.console.aws.amazon.com") assert CodeCommitProvider._is_valid_codecommit_hostname("us-east-1.console.aws.amazon.com") assert CodeCommitProvider._is_valid_codecommit_hostname("us-east-2.console.aws.amazon.com") assert CodeCommitProvider._is_valid_codecommit_hostname("us-gov-east-1.console.aws.amazon.com") assert CodeCommitProvider._is_valid_codecommit_hostname("us-gov-west-1.console.aws.amazon.com") assert CodeCommitProvider._is_valid_codecommit_hostname("us-west-1.console.aws.amazon.com") assert CodeCommitProvider._is_valid_codecommit_hostname("us-west-2.console.aws.amazon.com") # Test non-AWS regions assert not CodeCommitProvider._is_valid_codecommit_hostname("no-such-region.console.aws.amazon.com") assert not CodeCommitProvider._is_valid_codecommit_hostname("console.aws.amazon.com") # Test that an error is raised when an invalid CodeCommit URL is provided to the set_pr() method of the CodeCommitProvider class. 
# Generated by CodiumAI def test_invalid_codecommit_url(self): provider = CodeCommitProvider() with pytest.raises(ValueError): provider.set_pr("https://example.com/codecommit/repositories/my_test_repo/pull-requests/4321") def test_get_file_extensions(self): filenames = [ "app.py", "cli.py", "composer.json", "composer.lock", "hello.py", "image1.jpg", "image2.JPG", "index.js", "provider.py", "README", "test.py", ] expected_extensions = [ ".py", ".py", ".json", ".lock", ".py", ".jpg", ".jpg", ".js", ".py", "", ".py", ] extensions = CodeCommitProvider._get_file_extensions(filenames) assert extensions == expected_extensions def test_get_language_percentages(self): extensions = [ ".py", ".py", ".json", ".lock", ".py", ".jpg", ".jpg", ".js", ".py", "", ".py", ] percentages = CodeCommitProvider._get_language_percentages(extensions) assert percentages[".py"] == 45 assert percentages[".json"] == 9 assert percentages[".lock"] == 9 assert percentages[".jpg"] == 18 assert percentages[".js"] == 9 assert percentages[""] == 9 # The _get_file_extensions function needs the "." prefix on the extension, # but the _get_language_percentages function will work with or without the "." 
prefix extensions = [ "txt", "py", "py", ] percentages = CodeCommitProvider._get_language_percentages(extensions) assert percentages["py"] == 67 assert percentages["txt"] == 33 # test an empty list percentages = CodeCommitProvider._get_language_percentages([]) assert percentages == {} def test_get_edit_type(self): # Test that the _get_edit_type() function can convert a CodeCommit letter to an EDIT_TYPE enum assert CodeCommitProvider._get_edit_type("A") == EDIT_TYPE.ADDED assert CodeCommitProvider._get_edit_type("D") == EDIT_TYPE.DELETED assert CodeCommitProvider._get_edit_type("M") == EDIT_TYPE.MODIFIED assert CodeCommitProvider._get_edit_type("R") == EDIT_TYPE.RENAMED assert CodeCommitProvider._get_edit_type("a") == EDIT_TYPE.ADDED assert CodeCommitProvider._get_edit_type("d") == EDIT_TYPE.DELETED assert CodeCommitProvider._get_edit_type("m") == EDIT_TYPE.MODIFIED assert CodeCommitProvider._get_edit_type("r") == EDIT_TYPE.RENAMED assert CodeCommitProvider._get_edit_type("X") is None def test_add_additional_newlines(self): # a short string to test adding double newlines input = "abc\ndef\n\n___\nghi\njkl\nmno\n\npqr\n" expect = "abc\n\ndef\n\n___\n\nghi\n\njkl\n\nmno\n\npqr\n\n" assert CodeCommitProvider._add_additional_newlines(input) == expect # a test example from a real PR input = "## PR Type:\nEnhancement\n\n___\n## PR Description:\nThis PR introduces a new feature to the script, allowing users to filter servers by name.\n\n___\n## PR Main Files Walkthrough:\n`foo`: The foo script has been updated to include a new command line option `-f` or `--filter`.\n`bar`: The bar script has been updated to list stopped servers.\n" expect = "## PR Type:\n\nEnhancement\n\n___\n\n## PR Description:\n\nThis PR introduces a new feature to the script, allowing users to filter servers by name.\n\n___\n\n## PR Main Files Walkthrough:\n\n`foo`: The foo script has been updated to include a new command line option `-f` or `--filter`.\n\n`bar`: The bar script has been updated to 
list stopped servers.\n\n" assert CodeCommitProvider._add_additional_newlines(input) == expect def test_remove_markdown_html(self): input = "## PR Feedback\n<details><summary>Code feedback:</summary>\nfile foo\n</summary>\n" expect = "## PR Feedback\nCode feedback:\nfile foo\n\n" assert CodeCommitProvider._remove_markdown_html(input) == expect ================================================ FILE: tests/unittest/test_config_loader_secrets.py ================================================ from unittest.mock import MagicMock, patch from pr_agent.config_loader import apply_secrets_manager_config, apply_secrets_to_config class TestConfigLoaderSecrets: def test_apply_secrets_manager_config_success(self): with patch('pr_agent.secret_providers.get_secret_provider') as mock_get_provider, \ patch('pr_agent.config_loader.apply_secrets_to_config') as mock_apply_secrets, \ patch('pr_agent.config_loader.get_settings') as mock_get_settings: # Mock secret provider mock_provider = MagicMock() mock_provider.get_all_secrets.return_value = {'openai.key': 'sk-test'} mock_get_provider.return_value = mock_provider # Mock settings settings = MagicMock() settings.get.return_value = "aws_secrets_manager" mock_get_settings.return_value = settings apply_secrets_manager_config() mock_apply_secrets.assert_called_once_with({'openai.key': 'sk-test'}) def test_apply_secrets_manager_config_no_provider(self): with patch('pr_agent.secret_providers.get_secret_provider') as mock_get_provider: mock_get_provider.return_value = None # Confirm no exception is raised apply_secrets_manager_config() def test_apply_secrets_manager_config_not_aws(self): with patch('pr_agent.secret_providers.get_secret_provider') as mock_get_provider, \ patch('pr_agent.config_loader.get_settings') as mock_get_settings: # Mock Google Cloud Storage provider mock_provider = MagicMock() mock_get_provider.return_value = mock_provider # Mock settings (Google Cloud Storage) settings = MagicMock() settings.get.return_value = 
"google_cloud_storage" mock_get_settings.return_value = settings # Confirm execution is skipped for non-AWS Secrets Manager apply_secrets_manager_config() # Confirm get_all_secrets is not called assert not hasattr(mock_provider, 'get_all_secrets') or \ not mock_provider.get_all_secrets.called def test_apply_secrets_to_config_nested_keys(self): with patch('pr_agent.config_loader.get_settings') as mock_get_settings: settings = MagicMock() settings.get.return_value = None # No existing value settings.set = MagicMock() mock_get_settings.return_value = settings secrets = { 'openai.key': 'sk-test', 'github.webhook_secret': 'webhook-secret' } apply_secrets_to_config(secrets) # Confirm settings are applied correctly settings.set.assert_any_call('OPENAI.KEY', 'sk-test') settings.set.assert_any_call('GITHUB.WEBHOOK_SECRET', 'webhook-secret') def test_apply_secrets_to_config_existing_value_preserved(self): with patch('pr_agent.config_loader.get_settings') as mock_get_settings: settings = MagicMock() settings.get.return_value = "existing-value" # Existing value present settings.set = MagicMock() mock_get_settings.return_value = settings secrets = {'openai.key': 'sk-test'} apply_secrets_to_config(secrets) # Confirm settings are not overridden when existing value present settings.set.assert_not_called() def test_apply_secrets_to_config_single_key(self): with patch('pr_agent.config_loader.get_settings') as mock_get_settings: settings = MagicMock() settings.get.return_value = None settings.set = MagicMock() mock_get_settings.return_value = settings secrets = {'simple_key': 'simple_value'} apply_secrets_to_config(secrets) # Confirm non-dot notation keys are ignored settings.set.assert_not_called() def test_apply_secrets_to_config_multiple_dots(self): with patch('pr_agent.config_loader.get_settings') as mock_get_settings: settings = MagicMock() settings.get.return_value = None settings.set = MagicMock() mock_get_settings.return_value = settings secrets = {'section.subsection.key': 
'value'} apply_secrets_to_config(secrets) # Confirm keys with multiple dots are ignored settings.set.assert_not_called() def test_apply_secrets_manager_config_exception_handling(self): with patch('pr_agent.secret_providers.get_secret_provider') as mock_get_provider: mock_get_provider.side_effect = Exception("Provider error") # Confirm processing continues even when exception occurs apply_secrets_manager_config() # Confirm no exception is raised ================================================ FILE: tests/unittest/test_convert_to_markdown.py ================================================ # Generated by CodiumAI import textwrap from unittest.mock import Mock from pr_agent.algo.utils import PRReviewHeader, convert_to_markdown_v2 from pr_agent.tools.pr_description import insert_br_after_x_chars """ Code Analysis Objective: The objective of the 'convert_to_markdown' function is to convert a dictionary of data into a markdown-formatted text. The function takes in a dictionary as input and recursively iterates through its keys and values to generate the markdown text. Inputs: - A dictionary of data containing information about a pull request. Flow: - Initialize an empty string variable 'markdown_text'. - Create a dictionary 'emojis' containing emojis for each key in the input dictionary. - Iterate through the input dictionary: - If the value is empty, continue to the next iteration. - If the value is a dictionary, recursively call the 'convert_to_markdown' function with the value as input and append the returned markdown text to 'markdown_text'. - If the value is a list: - If the key is 'code suggestions', add an additional line break to 'markdown_text'. - Get the corresponding emoji for the key from the 'emojis' dictionary. If no emoji is found, use a dash. - Append the emoji and key to 'markdown_text'. 
- Iterate through the items in the list: - If the item is a dictionary and the key is 'code suggestions', call the 'parse_code_suggestion' function with the item as input and append the returned markdown text to 'markdown_text'. - If the item is not empty, append it to 'markdown_text'. - If the value is not 'n/a', get the corresponding emoji for the key from the 'emojis' dictionary. If no emoji is found, use a dash. Append the emoji, key, and value to 'markdown_text'. - Return 'markdown_text'. Outputs: - A markdown-formatted string containing the information from the input dictionary. Additional aspects: - The function uses recursion to handle nested dictionaries. - The 'parse_code_suggestion' function is called for items in the 'code suggestions' list. - The function uses emojis to add visual cues to the markdown text. """ class TestConvertToMarkdown: # Tests that the function works correctly with a simple dictionary input def test_simple_dictionary_input(self): input_data = {'review': { 'estimated_effort_to_review_[1-5]': '1, because the changes are minimal and straightforward, focusing on a single functionality addition.\n', 'relevant_tests': 'No\n', 'possible_issues': 'No\n', 'security_concerns': 'No\n'}} expected_output = textwrap.dedent(f"""\ {PRReviewHeader.REGULAR.value} 🔍 Here are some key observations to aid the review process: <table> <tr><td>⏱️ <strong>Estimated effort to review</strong>: 1 🔵⚪⚪⚪⚪</td></tr> <tr><td>🧪 <strong>No relevant tests</strong></td></tr> <tr><td> <strong>Possible issues</strong>: No </td></tr> <tr><td>🔒 <strong>No security concerns identified</strong></td></tr> </table> """) assert convert_to_markdown_v2(input_data).strip() == expected_output.strip() def test_simple_dictionary_input_without_gfm_supported(self): input_data = {'review': { 'estimated_effort_to_review_[1-5]': '1, because the changes are minimal and straightforward, focusing on a single functionality addition.\n', 'relevant_tests': 'No\n', 'possible_issues': 'No\n', 
'security_concerns': 'No\n'}} expected_output = textwrap.dedent("""\ ## PR Reviewer Guide 🔍 Here are some key observations to aid the review process: ### ⏱️ Estimated effort to review: 1 🔵⚪⚪⚪⚪ ### 🧪 No relevant tests ### Possible issues: No ### 🔒 No security concerns identified """) assert convert_to_markdown_v2(input_data, gfm_supported=False).strip() == expected_output.strip() def test_key_issues_to_review(self): input_data = {'review': { 'key_issues_to_review': [ { 'relevant_file': 'src/utils.py', 'issue_header': 'Code Smell', 'issue_content': 'The function is too long and complex.', 'start_line': 30, 'end_line': 50, } ] }} mock_git_provider = Mock() reference_link = 'https://github.com/qodo/pr-agent/pull/1/files#diff-hashvalue-R174' mock_git_provider.get_line_link.return_value = reference_link expected_output = textwrap.dedent(f"""\ ## PR Reviewer Guide 🔍 Here are some key observations to aid the review process: <table> <tr><td>⚡ <strong>Recommended focus areas for review</strong><br><br> <a href='{reference_link}'><strong>Code Smell</strong></a><br>The function is too long and complex. 
</td></tr> </table> """) assert convert_to_markdown_v2(input_data, git_provider=mock_git_provider).strip() == expected_output.strip() mock_git_provider.get_line_link.assert_called_with('src/utils.py', 30, 50) def test_ticket_compliance(self): input_data = {'review': { 'ticket_compliance_check': [ { 'ticket_url': 'https://example.com/ticket/123', 'ticket_requirements': '- Requirement 1\n- Requirement 2\n', 'fully_compliant_requirements': '- Requirement 1\n- Requirement 2\n', 'not_compliant_requirements': '', 'requires_further_human_verification': '', } ] }} expected_output = textwrap.dedent("""\ ## PR Reviewer Guide 🔍 Here are some key observations to aid the review process: <table> <tr><td> **🎫 Ticket compliance analysis ✅** **[123](https://example.com/ticket/123) - Fully compliant** Compliant requirements: - Requirement 1 - Requirement 2 </td></tr> </table> """) assert convert_to_markdown_v2(input_data).strip() == expected_output.strip() def test_can_be_split(self): input_data = {'review': { 'can_be_split': [ { 'relevant_files': [ 'src/file1.py', 'src/file2.py' ], 'title': 'Refactoring', }, { 'relevant_files': [ 'src/file3.py' ], 'title': 'Bug Fix', } ] } } expected_output = textwrap.dedent("""\ ## PR Reviewer Guide 🔍 Here are some key observations to aid the review process: <table> <tr><td>🔀 <strong>Multiple PR themes</strong><br><br> <details><summary> Sub-PR theme: <b>Refactoring</b></summary> ___ Relevant files: - src/file1.py - src/file2.py ___ </details> <details><summary> Sub-PR theme: <b>Bug Fix</b></summary> ___ Relevant files: - src/file3.py ___ </details> </td></tr> </table> """) assert convert_to_markdown_v2(input_data).strip() == expected_output.strip() def test_contribution_time_cost_estimate(self): input_data = { 'review': { 'contribution_time_cost_estimate': { 'best_case': '1h', 'average_case': '2h', 'worst_case': '30m', } } } expected_output = textwrap.dedent(f""" {PRReviewHeader.REGULAR.value} 🔍 Here are some key observations to aid the review 
process: <table> <tr><td>⏳ <strong>Contribution time estimate</strong> (best, average, worst case): 1h | 2h | 30 minutes</td></tr> </table> """) assert convert_to_markdown_v2(input_data).strip() == expected_output.strip() # Non-GFM branch expected_output_no_gfm = textwrap.dedent(f""" {PRReviewHeader.REGULAR.value} 🔍 Here are some key observations to aid the review process: ### ⏳ Contribution time estimate (best, average, worst case): 1h | 2h | 30 minutes """) assert convert_to_markdown_v2(input_data, gfm_supported=False).strip() == expected_output_no_gfm.strip() # Tests that the function works correctly with an empty dictionary input def test_empty_dictionary_input(self): input_data = {} expected_output = '' assert convert_to_markdown_v2(input_data).strip() == expected_output.strip() def test_dictionary_with_empty_dictionaries(self): input_data = {'review': {}} expected_output = '' assert convert_to_markdown_v2(input_data).strip() == expected_output.strip() class TestBR: def test_br1(self): file_change_description = '- Imported `FilePatchInfo` and `EDIT_TYPE` from `pr_agent.algo.types` instead of `pr_agent.git_providers.git_provider`.' 
file_change_description_br = insert_br_after_x_chars(file_change_description) expected_output = ('<ul><li>Imported <code>FilePatchInfo</code> and <code>EDIT_TYPE</code> from ' '<code>pr_agent.algo.types</code> instead <br>of ' '<code>pr_agent.git_providers.git_provider</code>.</ul>') assert file_change_description_br == expected_output # print("-----") # print(file_change_description_br) def test_br2(self): file_change_description = ( '- Created a - new -class `ColorPaletteResourcesCollection ColorPaletteResourcesCollection ' 'ColorPaletteResourcesCollection ColorPaletteResourcesCollection`') file_change_description_br = insert_br_after_x_chars(file_change_description) expected_output = ('<ul><li>Created a - new -class <code>ColorPaletteResourcesCollection </code><br><code>' 'ColorPaletteResourcesCollection ColorPaletteResourcesCollection ' '</code><br><code>ColorPaletteResourcesCollection</code></ul>') assert file_change_description_br == expected_output # print("-----") # print(file_change_description_br) def test_br3(self): file_change_description = 'Created a new class `ColorPaletteResourcesCollection` which extends `AvaloniaDictionary<ThemeVariant, ColorPaletteResources>` and implements aaa' file_change_description_br = insert_br_after_x_chars(file_change_description) assert file_change_description_br == ('Created a new class <code>ColorPaletteResourcesCollection</code> which ' 'extends <br><code>AvaloniaDictionary<ThemeVariant, ColorPaletteResources>' '</code> and implements <br>aaa') # print("-----") # print(file_change_description_br) ================================================ FILE: tests/unittest/test_delete_hunks.py ================================================ # Generated by CodiumAI from pr_agent.algo.git_patch_processing import omit_deletion_hunks """ Code Analysis Objective: The objective of the "omit_deletion_hunks" function is to remove deletion hunks from a patch file and return only the added lines. 
Inputs: - "patch_lines": a list of strings representing the lines of a patch file. Flow: - Initialize empty lists "temp_hunk" and "added_patched", and boolean variables "add_hunk" and "inside_hunk". - Compile a regular expression pattern to match hunk headers. - Iterate through each line in "patch_lines". - If the line starts with "@@", match the line with the hunk header pattern, finish the previous hunk if necessary, and append the line to "temp_hunk". - If the line does not start with "@@", append the line to "temp_hunk", check if it is an added line, and set "add_hunk" to True if it is. - If the function reaches the end of "patch_lines" and there is an unfinished hunk with added lines, append it to "added_patched". - Join the lines in "added_patched" with newline characters and return the resulting string. Outputs: - A string representing the added lines in the patch file. Additional aspects: - The function only considers hunks with added lines and ignores hunks with deleted lines. - The function assumes that the input patch file is well-formed and follows the unified diff format. """ class TestOmitDeletionHunks: # Tests that the function correctly handles a simple patch containing only additions def test_simple_patch_additions(self): patch_lines = ['@@ -1,0 +1,1 @@\n', '+added line\n'] expected_output = '@@ -1,0 +1,1 @@\n\n+added line\n' assert omit_deletion_hunks(patch_lines) == expected_output # Tests that the function correctly omits deletion hunks and concatenates multiple hunks in a patch. def test_patch_multiple_hunks(self): patch_lines = ['@@ -1,0 +1,1 @@\n', '-deleted line', '+added line\n', '@@ -2,0 +3,1 @@\n', '-deleted line\n', '-another deleted line\n'] expected_output = '@@ -1,0 +1,1 @@\n\n-deleted line\n+added line\n' assert omit_deletion_hunks(patch_lines) == expected_output # Tests that the function correctly omits deletion lines from the patch when there are no additions or context # lines. 
def test_patch_only_deletions(self): patch_lines = ['@@ -1,1 +1,0 @@\n', '-deleted line\n'] expected_output = '' assert omit_deletion_hunks(patch_lines) == expected_output # Additional deletion lines patch_lines = ['@@ -1,1 +1,0 @@\n', '-deleted line\n', '-another deleted line\n'] expected_output = '' assert omit_deletion_hunks(patch_lines) == expected_output # Additional context lines patch_lines = ['@@ -1,1 +1,0 @@\n', '-deleted line\n', '-another deleted line\n', 'context line 1\n', 'context line 2\n', 'context line 3\n'] expected_output = '' assert omit_deletion_hunks(patch_lines) == expected_output # Tests that the function correctly handles an empty patch def test_empty_patch(self): patch_lines = [] expected_output = '' assert omit_deletion_hunks(patch_lines) == expected_output # Tests that the function correctly handles a patch containing only one hunk def test_patch_one_hunk(self): patch_lines = ['@@ -1,0 +1,1 @@\n', '+added line\n'] expected_output = '@@ -1,0 +1,1 @@\n\n+added line\n' assert omit_deletion_hunks(patch_lines) == expected_output # Tests that the function correctly handles a patch containing only deletions and no additions def test_patch_deletions_no_additions(self): patch_lines = ['@@ -1,1 +1,0 @@\n', '-deleted line\n'] expected_output = '' assert omit_deletion_hunks(patch_lines) == expected_output ================================================ FILE: tests/unittest/test_extend_patch.py ================================================ import pytest from pr_agent.algo.git_patch_processing import extend_patch from pr_agent.algo.pr_processing import pr_generate_extended_diff from pr_agent.algo.token_handler import TokenHandler from pr_agent.algo.utils import load_large_diff from pr_agent.config_loader import get_settings get_settings(use_context=False).set("CONFIG.CLI_MODE", True) get_settings(use_context=False).config.allow_dynamic_context = False class TestExtendPatch: # Tests that the function works correctly with valid input def 
test_happy_path(self): original_file_str = 'line1\nline2\nline3\nline4\nline5' patch_str = '@@ -2,2 +2,2 @@ init()\n-line2\n+new_line2\n line3' num_lines = 1 expected_output = '\n@@ -1,4 +1,4 @@ init()\n line1\n-line2\n+new_line2\n line3\n line4' actual_output = extend_patch(original_file_str, patch_str, patch_extra_lines_before=num_lines, patch_extra_lines_after=num_lines) assert actual_output == expected_output # Tests that the function returns an empty string when patch_str is empty def test_empty_patch(self): original_file_str = 'line1\nline2\nline3\nline4\nline5' patch_str = '' num_lines = 1 expected_output = '' assert extend_patch(original_file_str, patch_str, patch_extra_lines_before=num_lines, patch_extra_lines_after=num_lines) == expected_output # Tests that the function returns the original patch when num_lines is 0 def test_zero_num_lines(self): original_file_str = 'line1\nline2\nline3\nline4\nline5' patch_str = '@@ -2,2 +2,2 @@ init()\n-line2\n+new_line2\nline3' num_lines = 0 assert extend_patch(original_file_str, patch_str, patch_extra_lines_before=num_lines, patch_extra_lines_after=num_lines) == patch_str # Tests that the function returns the original patch when patch_str contains no hunks def test_no_hunks(self): original_file_str = 'line1\nline2\nline3\nline4\nline5' patch_str = 'no hunks here' num_lines = 1 expected_output = 'no hunks here' assert extend_patch(original_file_str, patch_str, num_lines) == expected_output # Tests that the function extends a patch with a single hunk correctly def test_single_hunk(self): original_file_str = 'line1\nline2\nline3\nline4\nline5' patch_str = '@@ -2,3 +2,3 @@ init()\n-line2\n+new_line2\n line3\n line4' for num_lines in [1, 2, 3]: # check that even if we are over the number of lines in the file, the function still works expected_output = '\n@@ -1,5 +1,5 @@ init()\n line1\n-line2\n+new_line2\n line3\n line4\n line5' actual_output = extend_patch(original_file_str, patch_str, patch_extra_lines_before=num_lines, 
patch_extra_lines_after=num_lines) assert actual_output == expected_output # Tests the functionality of extending a patch with multiple hunks. def test_multiple_hunks(self): original_file_str = 'line1\nline2\nline3\nline4\nline5\nline6' patch_str = '@@ -2,3 +2,3 @@ init()\n-line2\n+new_line2\n line3\n line4\n@@ -4,1 +4,1 @@ init2()\n-line4\n+new_line4' # noqa: E501 num_lines = 1 original_allow_dynamic_context = get_settings(use_context=False).config.allow_dynamic_context get_settings(use_context=False).config.allow_dynamic_context = False expected_output = '\n@@ -1,5 +1,5 @@ init()\n line1\n-line2\n+new_line2\n line3\n line4\n line5\n\n@@ -3,3 +3,3 @@ init2()\n line3\n-line4\n+new_line4\n line5' # noqa: E501 actual_output = extend_patch(original_file_str, patch_str, patch_extra_lines_before=num_lines, patch_extra_lines_after=num_lines) assert actual_output == expected_output get_settings(use_context=False).config.allow_dynamic_context = True expected_output = '\n@@ -1,5 +1,5 @@ init()\n line1\n-line2\n+new_line2\n line3\n line4\n line5\n\n@@ -3,3 +3,3 @@ init2()\n line3\n-line4\n+new_line4\n line5' # noqa: E501 actual_output = extend_patch(original_file_str, patch_str, patch_extra_lines_before=num_lines, patch_extra_lines_after=num_lines) assert actual_output == expected_output get_settings(use_context=False).config.allow_dynamic_context = original_allow_dynamic_context def test_dynamic_context(self): get_settings(use_context=False).config.max_extra_lines_before_dynamic_context = 10 original_file_str = "def foo():" for i in range(9): original_file_str += f"\n line({i})" patch_str ="@@ -10,1 +10,1 @@ def foo():\n- line(8)\n+ new_line(8)" new_file_str = "\n".join(original_file_str.splitlines()[:-1] + [" new_line(8)"]) num_lines=1 get_settings(use_context=False).config.allow_dynamic_context = True actual_output = extend_patch(original_file_str, patch_str, patch_extra_lines_before=num_lines, patch_extra_lines_after=num_lines, new_file_str=new_file_str) 
expected_output='\n@@ -1,10 +1,10 @@ \n def foo():\n line(0)\n line(1)\n line(2)\n line(3)\n line(4)\n line(5)\n line(6)\n line(7)\n- line(8)\n+ new_line(8)' assert actual_output == expected_output get_settings(use_context=False).config.allow_dynamic_context = False actual_output2 = extend_patch(original_file_str, patch_str, patch_extra_lines_before=1, patch_extra_lines_after=1) expected_output_no_dynamic_context = '\n@@ -9,2 +9,2 @@ def foo():\n line(7)\n- line(8)\n+ new_line(8)' assert actual_output2 == expected_output_no_dynamic_context get_settings(use_context=False).config.allow_dynamic_context = False actual_output3 = extend_patch(original_file_str, patch_str, patch_extra_lines_before=3, patch_extra_lines_after=3) expected_output_no_dynamic_context = '\n@@ -7,4 +7,4 @@ def foo():\n line(5)\n line(6)\n line(7)\n- line(8)\n+ new_line(8)' assert actual_output3 == expected_output_no_dynamic_context class TestExtendedPatchMoreLines: class File: def __init__(self, base_file, patch, head_file, filename, ai_file_summary=None): self.base_file = base_file self.patch = patch self.head_file = head_file self.filename = filename self.ai_file_summary = ai_file_summary @pytest.fixture def token_handler(self): # Create a TokenHandler instance with dummy data th = TokenHandler(system="System prompt", user="User prompt") th.prompt_tokens = 100 return th @pytest.fixture def pr_languages(self): # Create a list of languages with files containing base_file and patch data return [ { 'files': [ self.File(base_file="line000\nline00\nline0\nline1\noriginal content\nline2\nline3\nline4\nline5\nline6\nline7\nline8\nline9\nline10", patch="@@ -5,5 +5,5 @@\n-original content\n+modified content\n line2\n line3\n line4\n line5", head_file="line000\nline00\nline0\nline1\nmodified content\nline2\nline3\nline4\nline5\nline6\nline7\nline8\nline9\nline10", filename="file1"), self.File(base_file="original content\nline2\nline3\nline4\nline5\nline6\nline7\nline8\nline9\nline10", patch="@@ -6,5 +6,5 
@@\nline6\nline7\nline8\n-line9\n+modified line9\nline10", head_file="original content\nline2\nline3\nline4\nline5\nline6\nline7\nline8\nmodified line9\nline10", filename="file2") ] } ] def test_extend_patches_with_extra_lines(self, token_handler, pr_languages): patches_extended_no_extra_lines, total_tokens, patches_extended_tokens = pr_generate_extended_diff( pr_languages, token_handler, add_line_numbers_to_hunks=False, patch_extra_lines_before=0, patch_extra_lines_after=0 ) # Check that with no extra lines, the patches are the same as the original patches p0 = patches_extended_no_extra_lines[0].strip() p1 = patches_extended_no_extra_lines[1].strip() assert p0 == "## File: 'file1'\n\n" + pr_languages[0]['files'][0].patch.strip() assert p1 == "## File: 'file2'\n\n" + pr_languages[0]['files'][1].patch.strip() patches_extended_with_extra_lines, total_tokens, patches_extended_tokens = pr_generate_extended_diff( pr_languages, token_handler, add_line_numbers_to_hunks=False, patch_extra_lines_before=2, patch_extra_lines_after=1 ) p0_extended = patches_extended_with_extra_lines[0].strip() assert p0_extended == "## File: 'file1'\n\n@@ -3,8 +3,8 @@ \n line0\n line1\n-original content\n+modified content\n line2\n line3\n line4\n line5\n line6" class TestLoadLargeDiff: def test_no_newline(self): patch = load_large_diff("test.py", """\ old content 1 some new content another line """, """ old content 1 old content 2""") patch_expected="""\ --- +++ @@ -1,3 +1,3 @@ - old content 1 - old content 2 + some new content + another line """ assert patch == patch_expected def test_empty_inputs(self): assert load_large_diff("test.py", "", "") == "" assert load_large_diff("test.py", None, None) == "" assert (load_large_diff("test.py", "content\n", "") == '--- \n+++ \n@@ -1 +1 @@\n-\n+content\n') ================================================ FILE: tests/unittest/test_extract_issue_from_branch.py ================================================ import pytest from 
pr_agent.tools.ticket_pr_compliance_check import extract_ticket_links_from_branch_name class TestExtractTicketsLinkFromBranchName: """Unit tests for branch-name issue extraction (option A: number at start of segment).""" def test_feature_slash_number_suffix(self): """feature/1-test-issue -> issue #1""" result = extract_ticket_links_from_branch_name( "feature/1-test-issue", "org/repo", "https://github.com" ) assert result == ["https://github.com/org/repo/issues/1"] def test_fix_slash_number_suffix(self): """fix/123-bug -> issue #123""" result = extract_ticket_links_from_branch_name( "fix/123-bug", "owner/repo", "https://github.com" ) assert result == ["https://github.com/owner/repo/issues/123"] def test_number_at_start_no_slash(self): """123-fix -> issue #123""" result = extract_ticket_links_from_branch_name( "123-fix", "org/repo", "https://github.com" ) assert result == ["https://github.com/org/repo/issues/123"] def test_empty_branch_returns_empty(self): """Empty branch name -> []""" result = extract_ticket_links_from_branch_name("", "org/repo") assert result == [] def test_none_branch_returns_empty(self): """None branch name -> []""" result = extract_ticket_links_from_branch_name(None, "org/repo") assert result == [] def test_no_digits_in_segment_returns_empty(self): """feature/no-issue -> []""" result = extract_ticket_links_from_branch_name( "feature/no-issue", "org/repo", "https://github.com" ) assert result == [] def test_base_url_no_trailing_slash(self): """base_url_html without trailing slash is normalized""" result = extract_ticket_links_from_branch_name( "feature/1-test", "org/repo", "https://github.com/" ) assert result == ["https://github.com/org/repo/issues/1"] def test_disable_via_config_returns_empty(self, monkeypatch): """When extract_issue_from_branch is False, return []""" fake_settings = type("Settings", (), {})() fake_settings.get = lambda key, default=None: ( False if key in ("extract_issue_from_branch", "config.extract_issue_from_branch") else ( 
"" if key in ("branch_issue_regex", "config.branch_issue_regex") else default ) ) import pr_agent.tools.ticket_pr_compliance_check as m monkeypatch.setattr(m, "get_settings", lambda: fake_settings) result = extract_ticket_links_from_branch_name( "feature/1-test", "org/repo", "https://github.com" ) assert result == [] def test_invalid_custom_regex_returns_empty(self, monkeypatch): """When branch_issue_regex is invalid, log and return []""" fake_settings = type("Settings", (), {})() fake_settings.get = lambda key, default=None: ( True if key in ("extract_issue_from_branch", "config.extract_issue_from_branch") else ( "[" if key in ("branch_issue_regex", "config.branch_issue_regex") else default ) ) import pr_agent.tools.ticket_pr_compliance_check as m monkeypatch.setattr(m, "get_settings", lambda: fake_settings) result = extract_ticket_links_from_branch_name( "feature/1-test", "org/repo", "https://github.com" ) assert result == [] def test_custom_regex_without_capturing_group_falls_back_to_default(self, monkeypatch): """When branch_issue_regex has no capturing group, fall back to default pattern (no crash).""" fake_settings = type("Settings", (), {})() fake_settings.get = lambda key, default=None: ( True if key in ("extract_issue_from_branch", "config.extract_issue_from_branch") else ( r"\d+" if key in ("branch_issue_regex", "config.branch_issue_regex") else default ) ) import pr_agent.tools.ticket_pr_compliance_check as m monkeypatch.setattr(m, "get_settings", lambda: fake_settings) result = extract_ticket_links_from_branch_name( "feature/1-test", "org/repo", "https://github.com" ) assert result == ["https://github.com/org/repo/issues/1"] def test_empty_repo_path_returns_empty(self): """Empty repo_path -> [] (guard in function)""" result = extract_ticket_links_from_branch_name("feature/1-test", "", "https://github.com") assert result == [] def test_multiple_matches_deduplicated(self): """Branch with multiple segments with numbers yields unique issue URLs""" result = 
extract_ticket_links_from_branch_name( "feature/1-test/2-other", "org/repo", "https://github.com" ) assert set(result) == { "https://github.com/org/repo/issues/1", "https://github.com/org/repo/issues/2", } ================================================ FILE: tests/unittest/test_fetching_sub_issues.py ================================================ # Currently doing API calls - wrong ! # import unittest # import asyncio # from unittest.mock import AsyncMock, patch # from pr_agent.tools.ticket_pr_compliance_check import extract_tickets, extract_and_cache_pr_tickets # from pr_agent.git_providers.github_provider import GithubProvider # # # class TestTicketCompliance(unittest.TestCase): # # @patch.object(GithubProvider, 'get_user_description', return_value="Fixes #1 and relates to #2") # @patch.object(GithubProvider, '_parse_issue_url', side_effect=lambda url: ("WonOfAKind/KimchiBot", int(url.split('#')[-1]))) # @patch.object(GithubProvider, 'repo_obj') # async def test_extract_tickets(self, mock_repo, mock_parse_issue_url, mock_user_desc): # """ # Test extract_tickets() to ensure it extracts tickets correctly # and fetches their content. # """ # github_provider = GithubProvider() # github_provider.repo = "WonOfAKind/KimchiBot" # github_provider.base_url_html = "https://github.com" # # # Mock issue retrieval # mock_issue = AsyncMock() # mock_issue.number = 1 # mock_issue.title = "Sample Issue" # mock_issue.body = "This is a test issue body." 
# mock_issue.labels = ["bug", "high priority"] # # # Mock repo object # mock_repo.get_issue.return_value = mock_issue # # tickets = await extract_tickets(github_provider) # # # Verify tickets were extracted correctly # self.assertIsInstance(tickets, list) # self.assertGreater(len(tickets), 0, "Expected at least one ticket!") # # # Verify ticket structure # first_ticket = tickets[0] # self.assertIn("ticket_id", first_ticket) # self.assertIn("ticket_url", first_ticket) # self.assertIn("title", first_ticket) # self.assertIn("body", first_ticket) # self.assertIn("labels", first_ticket) # # print("\n Test Passed: extract_tickets() successfully retrieved ticket info!") # # @patch.object(GithubProvider, 'get_user_description', return_value="Fixes #1 and relates to #2") # @patch.object(GithubProvider, '_parse_issue_url', side_effect=lambda url: ("WonOfAKind/KimchiBot", int(url.split('#')[-1]))) # @patch.object(GithubProvider, 'repo_obj') # async def test_extract_and_cache_pr_tickets(self, mock_repo, mock_parse_issue_url, mock_user_desc): # """ # Test extract_and_cache_pr_tickets() to ensure tickets are extracted and cached correctly. # """ # github_provider = GithubProvider() # github_provider.repo = "WonOfAKind/KimchiBot" # github_provider.base_url_html = "https://github.com" # # vars = {} # Simulate the dictionary to store results # # # Mock issue retrieval # mock_issue = AsyncMock() # mock_issue.number = 1 # mock_issue.title = "Sample Issue" # mock_issue.body = "This is a test issue body." 
# mock_issue.labels = ["bug", "high priority"] # # # Mock repo object # mock_repo.get_issue.return_value = mock_issue # # # Run function # await extract_and_cache_pr_tickets(github_provider, vars) # # # Ensure tickets are cached # self.assertIn("related_tickets", vars) # self.assertIsInstance(vars["related_tickets"], list) # self.assertGreater(len(vars["related_tickets"]), 0, "Expected at least one cached ticket!") # # print("\n Test Passed: extract_and_cache_pr_tickets() successfully cached ticket data!") # # def test_fetch_sub_issues(self): # """ # Test fetch_sub_issues() to ensure sub-issues are correctly retrieved. # """ # github_provider = GithubProvider() # issue_url = "https://github.com/WonOfAKind/KimchiBot/issues/1" # Known issue with sub-issues # result = github_provider.fetch_sub_issues(issue_url) # # print("Fetched sub-issues:", result) # # self.assertIsInstance(result, set) # Ensure result is a set # self.assertGreater(len(result), 0, "Expected at least one sub-issue but found none!") # # print("\n Test Passed: fetch_sub_issues() retrieved sub-issues correctly!") # # def test_fetch_sub_issues_with_no_results(self): # """ # Test fetch_sub_issues() to ensure an empty set is returned for an issue with no sub-issues. 
class TestIgnoreFilter:
    """Tests for ``filter_ignored`` under glob, regex and language/framework ignore settings."""

    @staticmethod
    def _make_files(*filenames):
        """Return one minimal stand-in object per name, each exposing only a ``filename`` attribute."""
        return [type('', (object,), {'filename': name})() for name in filenames]

    def test_no_ignores(self):
        """
        Test no files are ignored when no patterns are specified.
        """
        files = self._make_files('file1.py', 'file2.java', 'file3.cpp', 'file4.py', 'file5.py')

        assert filter_ignored(files) == files, "Expected all files to be returned when no ignore patterns are given."

    def test_glob_ignores(self, monkeypatch):
        """
        Test files are ignored when glob patterns are specified.
        """
        monkeypatch.setattr(global_settings.ignore, 'glob', ['*.py'])

        files = self._make_files('file1.py', 'file2.java', 'file3.cpp', 'file4.py', 'file5.py')
        expected = [files[1], files[2]]

        filtered_files = filter_ignored(files)
        assert filtered_files == expected, (
            f"Expected {[file.filename for file in expected]}, "
            f"but got {[file.filename for file in filtered_files]}."
        )

    def test_regex_ignores(self, monkeypatch):
        """
        Test files are ignored when regex patterns are specified.
        """
        # Raw string: `\.` is a regex escape, not a Python string escape.
        monkeypatch.setattr(global_settings.ignore, 'regex', [r'^file[2-4]\..*$'])

        files = self._make_files('file1.py', 'file2.java', 'file3.cpp', 'file4.py', 'file5.py')
        expected = [files[0], files[4]]

        filtered_files = filter_ignored(files)
        assert filtered_files == expected, (
            f"Expected {[file.filename for file in expected]}, "
            f"but got {[file.filename for file in filtered_files]}."
        )

    def test_invalid_regex(self, monkeypatch):
        """
        Test invalid patterns are quietly ignored.
        """
        # The first pattern is deliberately malformed; the second is the same valid pattern as above.
        monkeypatch.setattr(global_settings.ignore, 'regex', ['(((||', r'^file[2-4]\..*$'])

        files = self._make_files('file1.py', 'file2.java', 'file3.cpp', 'file4.py', 'file5.py')
        expected = [files[0], files[4]]

        filtered_files = filter_ignored(files)
        assert filtered_files == expected, (
            f"Expected {[file.filename for file in expected]}, "
            f"but got {[file.filename for file in filtered_files]}."
        )

    def test_language_framework_ignores(self, monkeypatch):
        """
        Test files are ignored based on language/framework mapping (e.g., protobuf).
        """
        monkeypatch.setattr(global_settings.config, 'ignore_language_framework', ['protobuf', 'go_gen'])

        files = self._make_files(
            'main.go',
            'dir1/service.pb.go',
            'dir1/dir/data_pb2.py',
            'file.py',
            'dir2/file_gen.go',
            'file.generated.go',
        )
        expected = [files[0], files[3]]

        filtered = filter_ignored(files)
        assert filtered == expected, (
            f"Expected {[f.filename for f in expected]}, "
            f"but got {[f.filename for f in filtered]}"
        )

    def test_skip_invalid_ignore_language_framework(self, monkeypatch):
        """
        Test skipping of generated code filtering when ignore_language_framework is not a list
        """
        # A bare string instead of a list: every file is expected to pass through.
        monkeypatch.setattr(global_settings.config, 'ignore_language_framework', 'protobuf')

        files = self._make_files('main.go', 'file.py', 'dir1/service.pb.go', 'file_pb2.py')
        expected = [files[0], files[1], files[2], files[3]]

        filtered = filter_ignored(files)
        assert filtered == expected, (
            f"Expected {[f.filename for f in expected]}, "
            f"but got {[f.filename for f in filtered]}"
        )
def test_similar_line_found_using_difflib(self):
    """A line that differs from the patch only by trailing whitespace still resolves to (2, 1)."""
    patch_info = FilePatchInfo(
        base_file='file1',
        head_file='file1',
        patch='@@ -1,1 +1,2 @@\n-line1\n+relevant_line in file similar match\n',
        filename='file1',
    )
    # The trailing space makes this an inexact match; per the original test's note it is
    # meant to simulate a similar line being found using difflib.
    fuzzy_line = '+relevant_line in file similar match '

    located = find_line_number_of_relevant_line_in_file([patch_info], 'file1', fuzzy_line)

    assert located == (2, 1)
class TestTryFixJson:
    """
    Tests for ``try_fix_json`` on review JSON that is cut off inside the second code suggestion.

    Every input starts with the same well-formed prefix (one complete suggestion) and then
    breaks off at a different point; each test expects only the complete suggestion to survive.
    """

    # Well-formed part of every input, up to and including the first complete code suggestion.
    _COMPLETE_PREFIX = (
        '{"PR Analysis": {"Main theme": "xxx", "Type of PR": "Bug fix"}, '
        '"PR Feedback": {"General PR suggestions": "..., `xxx`...", '
        '"Code suggestions": [{"relevant file": "xxx.py", "suggestion content": "xxx [important]"}'
    )

    # What every test expects back: the prefix's content with the truncated suggestion dropped.
    _EXPECTED_OUTPUT = {
        'PR Analysis': {
            'Main theme': 'xxx',
            'Type of PR': 'Bug fix'
        },
        'PR Feedback': {
            'General PR suggestions': '..., `xxx`...',
            'Code suggestions': [
                {
                    'relevant file': 'xxx.py',
                    'suggestion content': 'xxx [important]'
                }
            ]
        }
    }

    def test_incomplete_code_suggestions(self):
        """Input truncated inside the second suggestion's content string."""
        review = self._COMPLETE_PREFIX + (
            ', {"suggestion number": 2, "relevant file": "yyy.py", "suggestion content": "yyy [incomp...'
        )
        assert try_fix_json(review) == self._EXPECTED_OUTPUT

    def test_incomplete_code_suggestions_new_line(self):
        """Same truncation, with a newline and tab before the comma that separates the suggestions."""
        review = self._COMPLETE_PREFIX + (
            ' \n\t, {"suggestion number": 2, "relevant file": "yyy.py", "suggestion content": "yyy [incomp...'
        )
        assert try_fix_json(review) == self._EXPECTED_OUTPUT

    def test_incomplete_code_suggestions_many_close_brackets(self):
        """Truncated content that itself contains stray closing brackets and commas."""
        review = self._COMPLETE_PREFIX + (
            ' \n, {"suggestion number": 2, "relevant file": "yyy.py", '
            '"suggestion content": "yyy }, [}\n ,incomp.} ,..'
        )
        assert try_fix_json(review) == self._EXPECTED_OUTPUT

    def test_incomplete_code_suggestions_relevant_file(self):
        """Input truncated inside the second suggestion's "relevant file" value."""
        review = self._COMPLETE_PREFIX + ', {"suggestion number": 2, "relevant file": "yyy.p'
        assert try_fix_json(review) == self._EXPECTED_OUTPUT
def create_dynaconf_with_custom_loader(temp_dir, secrets_file):
    """
    Build a Dynaconf instance that loads ``secrets_file`` through the custom merge loader.

    The instance is created with core loaders disabled, ``pr_agent.custom_merge_loader`` and
    Dynaconf's env loader enabled, and merging turned on. According to the original author's
    note this mirrors the production setup in config_loader.py; that file is not visible here.

    fresh_vars is not passed explicitly: callers are expected to set the
    FRESH_VARS_FOR_DYNACONF environment variable before calling this helper.

    Args:
        temp_dir: Directory used as the Dynaconf root path
        secrets_file: Path to the settings file to load

    Returns:
        The configured Dynaconf instance
    """
    production_like_options = {
        "core_loaders": [],
        "loaders": ["pr_agent.custom_merge_loader", "dynaconf.loaders.env_loader"],
        "root_path": temp_dir,
        "merge_enabled": True,
        "envvar_prefix": False,
        "load_dotenv": False,
        "settings_files": [str(secrets_file)],
    }
    return Dynaconf(**production_like_options)
def test_gitlab_personal_access_token_reload(self):
    """
    Check that gitlab.personal_access_token is re-read from disk when GITLAB is fresh.

    Steps:
    1. Write the secrets file and read the token once.
    2. Rewrite the file with a new token.
    3. Read the token again through the same settings object and expect the new value.
    """
    fresh_env = {"FRESH_VARS_FOR_DYNACONF": '["GITLAB"]'}
    self.create_secrets_toml(personal_access_token="token_v1", shared_secret="secret_v1")

    # The environment variable is the only channel used here to mark GITLAB as fresh.
    with patch.dict(os.environ, fresh_env):
        settings = create_dynaconf_with_custom_loader(self.temp_dir, self.secrets_file)

        token_before = settings.GITLAB.PERSONAL_ACCESS_TOKEN
        assert token_before == "token_v1", "Initial personal_access_token should be 'token_v1'"

        # Change only the token on disk; the shared secret stays as it was.
        self.create_secrets_toml(personal_access_token="token_v2_updated", shared_secret="secret_v1")

        token_after = settings.GITLAB.PERSONAL_ACCESS_TOKEN
        assert token_after == "token_v2_updated", (
            "After file modification, personal_access_token should be reloaded to 'token_v2_updated'"
        )

        assert token_before != token_after, "fresh_vars should cause values to be reloaded, not cached"
class TestFreshVarsCustomLoaderIntegration:
    """
    Exercise fresh_vars together with the custom_merge_loader.

    Each test builds its settings through create_dynaconf_with_custom_loader (core loaders
    disabled), rewrites the secrets file on disk, and checks which values are re-read on
    the next attribute access.
    """

    def setup_method(self):
        """Create a scratch directory and remember where its .secrets.toml lives."""
        self.temp_dir = tempfile.mkdtemp()
        self.secrets_file = Path(self.temp_dir) / ".secrets.toml"

    def teardown_method(self):
        """Delete the scratch directory if it was created and still exists."""
        import shutil

        if not hasattr(self, "temp_dir"):
            return
        if Path(self.temp_dir).exists():
            shutil.rmtree(self.temp_dir)

    def create_secrets_toml(self, personal_access_token="initial_token", shared_secret="initial_secret"):
        """(Re)write .secrets.toml with a [gitlab] section holding the two given values."""
        toml_text = f"""[gitlab]
personal_access_token = "{personal_access_token}"
shared_secret = "{shared_secret}"
"""
        self.secrets_file.write_text(toml_text)

    def test_fresh_vars_without_core_loaders(self):
        """
        Check that a fresh section is re-read even though core_loaders is empty.

        The token is read, the file is rewritten, and the token is read again:
        - fresh_vars working: the second read returns the new value
        - fresh_vars broken: the second read returns the first (cached) value
        """
        self.create_secrets_toml(personal_access_token="token_before_bug_test")

        fresh_env = {"FRESH_VARS_FOR_DYNACONF": '["GITLAB"]'}
        with patch.dict(os.environ, fresh_env):
            settings = create_dynaconf_with_custom_loader(self.temp_dir, self.secrets_file)

            value_before = settings.GITLAB.PERSONAL_ACCESS_TOKEN
            assert value_before == "token_before_bug_test", "Initial value should be loaded correctly"

            self.create_secrets_toml(personal_access_token="token_after_bug_test")

            # The decisive read: it must reflect the rewritten file.
            value_after = settings.GITLAB.PERSONAL_ACCESS_TOKEN
            assert value_after == "token_after_bug_test", (
                "CRITICAL: fresh_vars should reload the value even with core_loaders=[]"
            )

            assert value_before != value_after, (
                "CRITICAL: Values should be different, indicating fresh_vars is working"
            )

    def test_custom_loader_respects_fresh_vars(self):
        """
        Check that only the section named in FRESH_VARS_FOR_DYNACONF is re-read.

        Both [gitlab] and [github] are rewritten on disk; GITLAB (fresh) must show the new
        value while GITHUB (not fresh) must still show the value from the first load.
        """

        def write_both_sections(version):
            # Same layout for both versions; only the token suffix changes.
            self.secrets_file.write_text(f"""[gitlab]
personal_access_token = "gitlab_token_{version}"

[github]
user_token = "github_token_{version}"
""")

        write_both_sections("v1")

        with patch.dict(os.environ, {"FRESH_VARS_FOR_DYNACONF": '["GITLAB"]'}):
            settings = create_dynaconf_with_custom_loader(self.temp_dir, self.secrets_file)

            gitlab_before = settings.GITLAB.PERSONAL_ACCESS_TOKEN
            github_before = settings.GITHUB.USER_TOKEN

            write_both_sections("v2")

            gitlab_after = settings.GITLAB.PERSONAL_ACCESS_TOKEN
            github_after = settings.GITHUB.USER_TOKEN

            # Fresh section: new value expected.
            assert gitlab_after == "gitlab_token_v2", "GITLAB section should be reloaded (marked as fresh)"
            assert gitlab_before != gitlab_after, "GITLAB values should not be cached"

            # Non-fresh section: the originally loaded value is expected to persist.
            assert github_after == "github_token_v1", "GITHUB section should be cached (not marked as fresh)"
            assert github_before == github_after, "GITHUB values should be cached"
def test_gitlab_credentials_not_cached_when_fresh(self):
    """
    Test that GitLab credentials are not cached when marked as fresh.

    The token is read three times against an unchanged file (all reads must agree), then
    the file is rewritten twice and each rewrite must be visible on the very next read.
    """
    self.create_secrets_toml(personal_access_token="no_cache_v1")

    # GITLAB is marked as fresh through the environment variable.
    with patch.dict(os.environ, {"FRESH_VARS_FOR_DYNACONF": '["GITLAB"]'}):
        settings = create_dynaconf_with_custom_loader(self.temp_dir, self.secrets_file)

        # Repeated reads of an unchanged file must agree with each other.
        access_1 = settings.GITLAB.PERSONAL_ACCESS_TOKEN
        access_2 = settings.GITLAB.PERSONAL_ACCESS_TOKEN
        access_3 = settings.GITLAB.PERSONAL_ACCESS_TOKEN
        assert access_1 == access_2 == access_3 == "no_cache_v1", (
            "Multiple accesses before modification should return same value"
        )

        # First rewrite: the next read must already see it.
        self.create_secrets_toml(personal_access_token="no_cache_v2")
        access_4 = settings.GITLAB.PERSONAL_ACCESS_TOKEN
        assert access_4 == "no_cache_v2", "First access after modification should return new value"
        assert access_1 != access_4, "Value should change after file modification (no caching)"

        # Second rewrite: same expectation.
        self.create_secrets_toml(personal_access_token="no_cache_v3")
        access_5 = settings.GITLAB.PERSONAL_ACCESS_TOKEN
        assert access_5 == "no_cache_v3", "Second modification should also be detected"

        # `a != b != c` only checks neighbouring pairs (it never compares a with c), so
        # check pairwise distinctness explicitly.
        assert len({access_1, access_4, access_5}) == 3, (
            "Each modification should result in a different value (no caching)"
        )
class TestGetMaxTokens:
    """
    Tests for ``get_max_tokens`` against a stubbed settings object.

    Every test replaces ``utils.get_settings`` with a stub exposing only
    ``config.custom_model_max_tokens`` and ``config.max_model_tokens``.
    """

    @staticmethod
    def _patch_settings(monkeypatch, custom_model_max_tokens=0, max_model_tokens=0):
        """Make ``utils.get_settings()`` return a stub carrying the two given config values."""
        fake_settings = type('', (), {
            'config': type('', (), {
                'custom_model_max_tokens': custom_model_max_tokens,
                'max_model_tokens': max_model_tokens,
            })()
        })()
        monkeypatch.setattr(utils, "get_settings", lambda: fake_settings)

    # Test if the model is in MAX_TOKENS
    def test_model_max_tokens(self, monkeypatch):
        self._patch_settings(monkeypatch)

        model = "gpt-3.5-turbo"
        expected = MAX_TOKENS[model]

        assert get_max_tokens(model) == expected

    @pytest.mark.parametrize("model", ["gpt-5.4", "gpt-5.4-2026-03-05"])
    def test_gpt54_model_max_tokens(self, monkeypatch, model):
        self._patch_settings(monkeypatch)

        assert get_max_tokens(model) == 272000

    # Test situations where the model is not registered and exists as a custom model
    def test_model_has_custom(self, monkeypatch):
        self._patch_settings(
            monkeypatch,
            custom_model_max_tokens=5000,
            max_model_tokens=0,  # no limit
        )

        model = "custom-model"
        expected = 5000

        assert get_max_tokens(model) == expected

    @pytest.mark.parametrize("model", [
        "gpt-5.1-codex",
        "gpt-5.2-codex",
        "gpt-5.3-codex",
    ])
    def test_gpt_codex_models_max_tokens(self, monkeypatch, model):
        self._patch_settings(monkeypatch)

        expected = MAX_TOKENS[model]

        assert get_max_tokens(model) == expected

    # An unregistered model with no custom limit configured must raise.
    def test_model_not_max_tokens_and_not_has_custom(self, monkeypatch):
        self._patch_settings(monkeypatch)

        model = "custom-model"

        with pytest.raises(Exception):
            get_max_tokens(model)

    def test_model_max_tokens_with__limit(self, monkeypatch):
        self._patch_settings(monkeypatch, max_model_tokens=10000)

        # NOTE(review): the original comment gave this model's registered limit as 160000;
        # that figure is not verifiable from this file. The test presumably relies on the
        # registered limit being above the 10000 cap - confirm against MAX_TOKENS.
        model = "gpt-3.5-turbo"
        expected = 10000

        assert get_max_tokens(model) == expected

    @pytest.mark.parametrize("model", [
        "gemini/gemini-3-flash-preview",
        "vertex_ai/gemini-3-flash-preview",
        "gemini/gemini-3-pro-preview",
        "vertex_ai/gemini-3-pro-preview",
        "gemini/gemini-3.1-pro-preview",
        "vertex_ai/gemini-3.1-pro-preview",
    ])
    def test_gemini_3_and_3_1_pro_preview(self, monkeypatch, model):
        self._patch_settings(monkeypatch)

        assert get_max_tokens(model) == 1048576

    @pytest.mark.parametrize(
        "model",
        [
            "anthropic/claude-opus-4-6",
            "claude-opus-4-6",
            "vertex_ai/claude-opus-4-6",
            "bedrock/anthropic.claude-opus-4-6-v1:0",
            "bedrock/global.anthropic.claude-opus-4-6-v1:0",
            "bedrock/us.anthropic.claude-opus-4-6-v1:0",
        ],
    )
    def test_claude_opus_4_6_model_max_tokens(self, monkeypatch, model):
        self._patch_settings(monkeypatch)

        assert get_max_tokens(model) == 200000

    @pytest.mark.parametrize(
        "model",
        [
            "anthropic/claude-sonnet-4-6",
            "claude-sonnet-4-6",
            "vertex_ai/claude-sonnet-4-6",
            "bedrock/anthropic.claude-sonnet-4-6",
            "bedrock/global.anthropic.claude-sonnet-4-6",
            "bedrock/us.anthropic.claude-sonnet-4-6",
            "bedrock/au.anthropic.claude-sonnet-4-6",
            "bedrock/eu.anthropic.claude-sonnet-4-6",
            "bedrock/jp.anthropic.claude-sonnet-4-6",
        ],
    )
    def test_claude_sonnet_4_6_model_max_tokens(self, monkeypatch, model):
        self._patch_settings(monkeypatch)

        assert get_max_tokens(model) == 200000
get_change_file_pull_request mock_api_client.reset_mock() mock_resp = MagicMock() mock_resp.data = BytesIO(b'[]') mock_api_client.call_api.return_value = mock_resp repo_api.get_change_file_pull_request('owner', 'repo', 123) args, kwargs = mock_api_client.call_api.call_args assert '/repos/owner/repo/pulls/123/files' in args[0] assert kwargs.get('auth_settings') == ['AuthorizationHeaderToken'] assert 'token=' not in args[0] # 2. get_pull_request_diff mock_api_client.reset_mock() mock_resp = MagicMock() mock_resp.data = BytesIO(b'diff content') mock_api_client.call_api.return_value = mock_resp repo_api.get_pull_request_diff('owner', 'repo', 123) args, kwargs = mock_api_client.call_api.call_args assert args[0] == '/repos/owner/repo/pulls/123.diff' assert kwargs.get('auth_settings') == ['AuthorizationHeaderToken'] # 3. get_languages mock_api_client.reset_mock() mock_resp.data = BytesIO(b'{"Python": 100}') mock_api_client.call_api.return_value = mock_resp repo_api.get_languages('owner', 'repo') args, kwargs = mock_api_client.call_api.call_args assert args[0] == '/repos/owner/repo/languages' assert kwargs.get('auth_settings') == ['AuthorizationHeaderToken'] # 4. get_file_content mock_api_client.reset_mock() mock_resp.data = BytesIO(b'content') mock_api_client.call_api.return_value = mock_resp repo_api.get_file_content('owner', 'repo', 'sha1', 'file.txt') args, kwargs = mock_api_client.call_api.call_args assert args[0] == '/repos/owner/repo/raw/file.txt' assert kwargs.get('query_params') == [('ref', 'sha1')] assert kwargs.get('auth_settings') == ['AuthorizationHeaderToken'] # 5. 
class TestGitHubOutput:
    """
    Tests for ``github_action_output``, which is expected to write ``<key>=<json>`` to the
    file named by the GITHUB_OUTPUT environment variable only when output is enabled.

    NOTE(review): the first two tests change GITHUB_ACTION_CONFIG.ENABLE_OUTPUT through
    ``get_settings().set`` and never restore it. If ``get_settings()`` returns a shared
    object (presumably it does - confirm), the later tests depend on execution order.
    """

    def test_github_action_output_enabled(self, monkeypatch, tmp_path):
        """With output enabled, the file holds the key and the JSON-encoded value for that key."""
        get_settings().set('GITHUB_ACTION_CONFIG.ENABLE_OUTPUT', True)
        output_file = tmp_path / 'output'
        monkeypatch.setenv('GITHUB_OUTPUT', str(output_file))
        output_data = {'key1': {'value1': 1, 'value2': 2}}
        key_name = 'key1'

        github_action_output(output_data, key_name)

        env_value = output_file.read_text()

        # Split on the first '=' only, so a JSON payload that contains '=' is not truncated.
        actual_key, _, raw_value = env_value.partition('=')
        actual_data = json.loads(raw_value)

        assert actual_key == key_name
        assert actual_data == output_data[key_name]

    def test_github_action_output_disabled(self, monkeypatch, tmp_path):
        """With output explicitly disabled, no output file is created."""
        get_settings().set('GITHUB_ACTION_CONFIG.ENABLE_OUTPUT', False)
        output_file = tmp_path / 'output'
        monkeypatch.setenv('GITHUB_OUTPUT', str(output_file))
        output_data = {'key1': {'value1': 1, 'value2': 2}}
        key_name = 'key1'

        github_action_output(output_data, key_name)

        assert not os.path.exists(str(output_file))

    def test_github_action_output_notset(self, monkeypatch, tmp_path):
        """Without this test touching the setting, no output file is created."""
        # not set config
        output_file = tmp_path / 'output'
        monkeypatch.setenv('GITHUB_OUTPUT', str(output_file))
        output_data = {'key1': {'value1': 1, 'value2': 2}}
        key_name = 'key1'

        github_action_output(output_data, key_name)

        assert not os.path.exists(str(output_file))

    def test_github_action_output_error_case(self, monkeypatch, tmp_path):
        """Invalid output data (None) must not produce an output file."""
        output_file = tmp_path / 'output'
        monkeypatch.setenv('GITHUB_OUTPUT', str(output_file))
        output_data = None  # invalid data
        key_name = 'key1'

        github_action_output(output_data, key_name)

        assert not os.path.exists(str(output_file))
def test_create_or_update_pr_file_update_existing(self, gitlab_provider, mock_project):
    """Updating an existing file stores the new content on it and saves it to the branch."""
    mock_file = MagicMock(ProjectFile)
    mock_file.decode.return_value = "# Old changelog content"
    mock_project.files.get.return_value = mock_file

    new_content = "# New changelog content"
    commit_message = "Update CHANGELOG.md"

    gitlab_provider.create_or_update_pr_file(
        "CHANGELOG.md", "feature-branch", new_content, commit_message
    )

    mock_project.files.get.assert_called_once_with("CHANGELOG.md", "feature-branch")
    # This used to be `mock_file.content = new_content`, an assignment that
    # verifies nothing. Assert instead, so the test fails if the provider
    # never writes the new content onto the fetched file object.
    # NOTE(review): assumes the provider assigns the raw string to `.content`
    # (no extra encoding) -- confirm against GitLabProvider.
    assert mock_file.content == new_content
    mock_file.save.assert_called_once_with(branch="feature-branch", commit_message=commit_message)
def test_start_uses_port_env(monkeypatch):
    """A numeric PORT env var reaches ``uvicorn.run`` as an int, with host 0.0.0.0."""
    monkeypatch.setenv("PORT", "4567")

    with mock.patch.object(gitlab_webhook.uvicorn, "run") as uvicorn_run:
        gitlab_webhook.start()

    # call_args is (positional args, keyword args); only the keywords matter here.
    run_kwargs = uvicorn_run.call_args[1]
    assert run_kwargs["port"] == 4567
    assert run_kwargs["host"] == "0.0.0.0"
class TestHandlePatchDeletions:
    """Tests for ``handle_patch_deletions``."""

    def test_handle_patch_deletions_happy_path_new_file_content_exists(self):
        """A patch that also adds lines comes back unchanged apart from trailing whitespace."""
        patch = '--- a/file.py\n+++ b/file.py\n@@ -1,2 +1,2 @@\n-foo\n-bar\n+baz\n'
        original_file_content_str = 'foo\nbar\n'
        new_file_content_str = 'foo\nbaz\n'
        file_name = 'file.py'
        assert handle_patch_deletions(patch, original_file_content_str, new_file_content_str,
                                      file_name) == patch.rstrip()

    def test_handle_patch_deletions_edge_case_new_file_content_empty(self):
        """An empty new file content makes the function return ``None``.

        The earlier comment claimed the result was 'File was deleted'; the
        assertion below checks for ``None``.
        """
        patch = '--- a/file.py\n+++ b/file.py\n@@ -1,2 +1,2 @@\n-foo\n-bar\n'
        original_file_content_str = 'foo\nbar\n'
        new_file_content_str = ''
        file_name = 'file.py'
        assert handle_patch_deletions(patch, original_file_content_str, new_file_content_str,
                                      file_name) is None

    def test_handle_patch_deletions_edge_case_patch_and_patch_new_are_equal(self):
        """With identical old and new content, the result equals the patch, ignoring trailing whitespace."""
        patch = '--- a/file.py\n+++ b/file.py\n@@ -1,2 +1,2 @@\n-foo\n-bar\n'
        original_file_content_str = 'foo\nbar\n'
        new_file_content_str = 'foo\nbar\n'
        file_name = 'file.py'
        assert handle_patch_deletions(patch, original_file_content_str, new_file_content_str,
                                      file_name).rstrip() == patch.rstrip()

    def test_handle_patch_deletions_edge_case_patch_and_patch_new_are_not_equal(self):
        """With differing content, the result is the input patch minus its trailing newline."""
        patch = '--- a/file.py\n+++ b/file.py\n@@ -1,2 +1,2 @@\n-foo\n-bar\n'
        original_file_content_str = 'foo\nbar\n'
        new_file_content_str = 'foo\nbaz\n'
        file_name = 'file.py'
        expected_patch = '--- a/file.py\n+++ b/file.py\n@@ -1,2 +1,2 @@\n-foo\n-bar'
        assert handle_patch_deletions(patch, original_file_content_str, new_file_content_str,
                                      file_name) == expected_patch
class TestIgnoreRepositories:
    """Checks CONFIG.IGNORE_REPOSITORIES handling for every entry in PROVIDERS.

    Each test is parametrized over ``(provider_name, provider_func, body_func)``:
    ``body_func`` builds a provider-specific webhook payload for a repository
    full name, and ``provider_func`` is called with that payload.
    """

    def setup_method(self):
        # Start every test from an empty ignore list; the settings object is shared.
        get_settings().set("CONFIG.IGNORE_REPOSITORIES", [])

    @pytest.mark.parametrize("provider_name, provider_func, body_func", PROVIDERS)
    def test_should_ignore_matching_repository(self, provider_name, provider_func, body_func):
        """A repository listed in the ignore config yields ``False``."""
        get_settings().set("CONFIG.IGNORE_REPOSITORIES", ["org/repo-to-ignore"])

        # Build the payload directly from the repository name; the previous
        # intermediate GitHub-shaped dict was only used to read this string back.
        result = provider_func(body_func("org/repo-to-ignore"))

        assert result is False, f"{provider_name}: PR from ignored repository should be ignored (return False)"

    @pytest.mark.parametrize("provider_name, provider_func, body_func", PROVIDERS)
    def test_should_not_ignore_non_matching_repository(self, provider_name, provider_func, body_func):
        """A repository that is not in the ignore config yields ``True``."""
        get_settings().set("CONFIG.IGNORE_REPOSITORIES", ["org/repo-to-ignore"])

        result = provider_func(body_func("org/other-repo"))

        assert result is True, f"{provider_name}: PR from non-ignored repository should not be ignored (return True)"

    @pytest.mark.parametrize("provider_name, provider_func, body_func", PROVIDERS)
    def test_should_not_ignore_when_config_empty(self, provider_name, provider_func, body_func):
        """With an empty ignore list, every repository yields ``True``."""
        get_settings().set("CONFIG.IGNORE_REPOSITORIES", [])

        result = provider_func(body_func("org/repo-to-ignore"))

        assert result is True, f"{provider_name}: PR should not be ignored if ignore_repositories config is empty"
def test_happy_path_sort_files_by_main_languages(self):
    """Files are grouped per language in the order Python, Java, C++, then an empty 'Other'."""
    languages = {'Python': 10, 'Java': 5, 'C++': 3}
    filenames = ['file1.py', 'file2.java', 'file3.cpp', 'file4.py', 'file5.py']
    # Minimal stand-ins: anonymous objects exposing only a `filename` attribute.
    files = [type('', (object,), {'filename': name})() for name in filenames]
    py_a, java_a, cpp_a, py_b, py_c = files

    expected_output = [
        {'language': 'Python', 'files': [py_a, py_b, py_c]},
        {'language': 'Java', 'files': [java_a]},
        {'language': 'C++', 'files': [cpp_a]},
        {'language': 'Other', 'files': []},
    ]

    actual_output = sort_files_by_main_languages(languages, files)
    assert actual_output == expected_output
def test_edge_case_languages_with_no_extensions(self):
    """A file whose extension matches none of the given languages lands under 'Other'."""
    languages = {'Python': 10, 'Java': 5, 'C++': 3}
    filenames = ('file1.py', 'file2.java', 'file3.cpp', 'file3.test')
    files = [type('', (object,), {'filename': name})() for name in filenames]
    py_file, java_file, cpp_file, unmatched_file = files

    expected_output = [
        {'language': 'Python', 'files': [py_file]},
        {'language': 'Java', 'files': [java_file]},
        {'language': 'C++', 'files': [cpp_file]},
        {'language': 'Other', 'files': [unmatched_file]},
    ]

    actual_output = sort_files_by_main_languages(languages, files)
    assert actual_output == expected_output
def create_mock_settings(reasoning_effort_value):
    """Build a fake settings object whose ``config.reasoning_effort`` is configurable.

    The result exposes ``config`` and ``litellm`` sections plus a top-level
    ``get``. Every ``get(key, default=None)`` simply returns ``default``.
    """

    def return_default(self, key, default=None):
        # Lives on the class, so it binds as a method and `self` is supplied automatically.
        return default

    config_attrs = {
        'reasoning_effort': reasoning_effort_value,
        'ai_timeout': 120,
        'custom_reasoning_model': False,
        'max_model_tokens': 32000,
        'verbosity_level': 0,
        'get': return_default,
    }
    config_section = type('', (), config_attrs)()
    litellm_section = type('', (), {'get': return_default})()

    settings_cls = type('', (), {
        'config': config_section,
        'litellm': litellm_section,
        'get': return_default,
    })
    return settings_cls()
@pytest.mark.asyncio
async def test_gpt5_valid_reasoning_effort_none(self, monkeypatch, mock_logger):
    """reasoning_effort='none' from config is forwarded to acompletion for a GPT-5 model."""
    settings_stub = create_mock_settings("none")
    monkeypatch.setattr(litellm_handler, "get_settings", lambda: settings_stub)

    # Replace acompletion so the kwargs it receives can be inspected.
    acompletion_path = 'pr_agent.algo.ai_handlers.litellm_ai_handler.acompletion'
    with patch(acompletion_path, new_callable=AsyncMock) as completion_mock:
        completion_mock.return_value = create_mock_acompletion_response()
        ai_handler = LiteLLMAIHandler()
        await ai_handler.chat_completion(
            model="gpt-5-2025-08-07",
            system="test system",
            user="test user"
        )

    # The call must have happened and carried the configured effort.
    assert completion_mock.called
    sent_kwargs = completion_mock.call_args[1]
    assert sent_kwargs["reasoning_effort"] == "none"
    assert "reasoning_effort" in sent_kwargs["allowed_openai_params"]

    # The chosen effort is reported at info level.
    mock_logger.info.assert_any_call("Using reasoning_effort='none' for GPT-5 model")
@pytest.mark.asyncio
async def test_gpt5_valid_reasoning_effort_medium(self, monkeypatch, mock_logger):
    """reasoning_effort='medium' from config is forwarded to acompletion for a GPT-5 model."""
    settings_stub = create_mock_settings("medium")
    monkeypatch.setattr(litellm_handler, "get_settings", lambda: settings_stub)

    acompletion_path = 'pr_agent.algo.ai_handlers.litellm_ai_handler.acompletion'
    with patch(acompletion_path, new_callable=AsyncMock) as completion_mock:
        completion_mock.return_value = create_mock_acompletion_response()
        ai_handler = LiteLLMAIHandler()
        await ai_handler.chat_completion(
            model="gpt-5-2025-08-07",
            system="test system",
            user="test user"
        )

    sent_kwargs = completion_mock.call_args[1]
    assert sent_kwargs["reasoning_effort"] == "medium"
    assert "reasoning_effort" in sent_kwargs["allowed_openai_params"]
    mock_logger.info.assert_any_call("Using reasoning_effort='medium' for GPT-5 model")
@pytest.mark.asyncio
async def test_gpt5_valid_reasoning_effort_minimal(self, monkeypatch, mock_logger):
    """reasoning_effort='minimal' from config is forwarded to acompletion for a GPT-5 model."""
    settings_stub = create_mock_settings("minimal")
    monkeypatch.setattr(litellm_handler, "get_settings", lambda: settings_stub)

    acompletion_path = 'pr_agent.algo.ai_handlers.litellm_ai_handler.acompletion'
    with patch(acompletion_path, new_callable=AsyncMock) as completion_mock:
        completion_mock.return_value = create_mock_acompletion_response()
        ai_handler = LiteLLMAIHandler()
        await ai_handler.chat_completion(
            model="gpt-5-2025-08-07",
            system="test system",
            user="test user"
        )

    sent_kwargs = completion_mock.call_args[1]
    assert sent_kwargs["reasoning_effort"] == "minimal"
    assert "reasoning_effort" in sent_kwargs["allowed_openai_params"]
    mock_logger.info.assert_any_call("Using reasoning_effort='minimal' for GPT-5 model")
@pytest.mark.asyncio
async def test_gpt5_none_config_defaults_to_medium(self, monkeypatch, mock_logger):
    """A reasoning_effort of None falls back to 'medium' and logs no warning."""
    settings_stub = create_mock_settings(None)
    monkeypatch.setattr(litellm_handler, "get_settings", lambda: settings_stub)

    acompletion_path = 'pr_agent.algo.ai_handlers.litellm_ai_handler.acompletion'
    with patch(acompletion_path, new_callable=AsyncMock) as completion_mock:
        completion_mock.return_value = create_mock_acompletion_response()
        ai_handler = LiteLLMAIHandler()
        await ai_handler.chat_completion(
            model="gpt-5-2025-08-07",
            system="test system",
            user="test user"
        )

    # The fallback value is what reaches acompletion.
    sent_kwargs = completion_mock.call_args[1]
    assert sent_kwargs["reasoning_effort"] == "medium"

    # An unset value is not treated as invalid, so nothing is warned about.
    mock_logger.warning.assert_not_called()

    # The effective effort is still reported at info level.
    mock_logger.info.assert_any_call("Using reasoning_effort='medium' for GPT-5 model")
@pytest.mark.asyncio
async def test_non_gpt5_model_no_thinking_kwargs(self, monkeypatch, mock_logger):
    """Non-GPT-5 models must not receive a ``reasoning_effort`` kwarg.

    Several model names are checked in one loop, so each assertion carries the
    model name; without it a failure would not say which model broke.
    """
    fake_settings = create_mock_settings("high")
    monkeypatch.setattr(litellm_handler, "get_settings", lambda: fake_settings)

    non_gpt5_models = ["gpt-4o", "gpt-4-turbo", "claude-3-5-sonnet"]

    for model in non_gpt5_models:
        with patch('pr_agent.algo.ai_handlers.litellm_ai_handler.acompletion', new_callable=AsyncMock) as mock_completion:
            mock_completion.return_value = create_mock_acompletion_response()

            handler = LiteLLMAIHandler()
            await handler.chat_completion(
                model=model,
                system="test system",
                user="test user"
            )

            # Guard first: call_args is None when acompletion was never awaited,
            # which would otherwise surface as an opaque TypeError below.
            assert mock_completion.called, f"acompletion was not called for model {model!r}"

            # Should not have reasoning_effort in kwargs
            call_kwargs = mock_completion.call_args[1]
            assert "reasoning_effort" not in call_kwargs, (
                f"unexpected reasoning_effort for non-GPT-5 model {model!r}"
            )
@pytest.mark.asyncio
async def test_gpt5_thinking_suffix_default_medium(self, monkeypatch, mock_logger):
    """A '_thinking' GPT-5 model with settings built from None is sent reasoning_effort 'medium'."""
    settings_stub = create_mock_settings(None)
    monkeypatch.setattr(litellm_handler, "get_settings", lambda: settings_stub)

    acompletion_path = 'pr_agent.algo.ai_handlers.litellm_ai_handler.acompletion'
    with patch(acompletion_path, new_callable=AsyncMock) as completion_mock:
        completion_mock.return_value = create_mock_acompletion_response()
        llm = LiteLLMAIHandler()
        await llm.chat_completion(
            model="gpt-5-2025-08-07_thinking",
            system="test system",
            user="test user",
        )

    sent_kwargs = completion_mock.call_args.kwargs
    assert sent_kwargs["reasoning_effort"] == "medium"
    mock_logger.info.assert_any_call("Using reasoning_effort='medium' for GPT-5 model")


@pytest.mark.asyncio
async def test_gpt5_regular_suffix_default_medium(self, monkeypatch, mock_logger):
    """A plain GPT-5 model with settings built from None is sent reasoning_effort 'medium'."""
    settings_stub = create_mock_settings(None)
    monkeypatch.setattr(litellm_handler, "get_settings", lambda: settings_stub)

    acompletion_path = 'pr_agent.algo.ai_handlers.litellm_ai_handler.acompletion'
    with patch(acompletion_path, new_callable=AsyncMock) as completion_mock:
        completion_mock.return_value = create_mock_acompletion_response()
        llm = LiteLLMAIHandler()
        await llm.chat_completion(
            model="gpt-5-2025-08-07",
            system="test system",
            user="test user",
        )

    sent_kwargs = completion_mock.call_args.kwargs
    assert sent_kwargs["reasoning_effort"] == "medium"
    mock_logger.info.assert_any_call("Using reasoning_effort='medium' for GPT-5 model")
@pytest.mark.asyncio
async def test_gpt5_info_logging_configured_value(self, monkeypatch, mock_logger):
    """The info log names the effort value that the settings were built with ('low')."""
    settings_stub = create_mock_settings("low")
    monkeypatch.setattr(litellm_handler, "get_settings", lambda: settings_stub)

    acompletion_path = 'pr_agent.algo.ai_handlers.litellm_ai_handler.acompletion'
    with patch(acompletion_path, new_callable=AsyncMock) as completion_mock:
        completion_mock.return_value = create_mock_acompletion_response()
        llm = LiteLLMAIHandler()
        await llm.chat_completion(
            model="gpt-5-2025-08-07",
            system="test system",
            user="test user",
        )

    # Only the log line is checked here; the completion kwargs are not inspected.
    mock_logger.info.assert_any_call("Using reasoning_effort='low' for GPT-5 model")
@pytest.mark.asyncio
async def test_gpt5_warning_only_for_invalid_non_none(self, monkeypatch, mock_logger):
    """A warning is expected for the value "ultra" but not for None.

    The same GPT-5 request is issued twice: first with settings built from
    None (no warning may be recorded), then, after resetting the logger mock,
    with settings built from "ultra" (exactly one warning must be recorded).
    """

    async def issue_request(effort_value):
        # Install fresh settings, then run one completion against a patched acompletion.
        settings_stub = create_mock_settings(effort_value)
        monkeypatch.setattr(litellm_handler, "get_settings", lambda: settings_stub)
        acompletion_path = 'pr_agent.algo.ai_handlers.litellm_ai_handler.acompletion'
        with patch(acompletion_path, new_callable=AsyncMock) as completion_mock:
            completion_mock.return_value = create_mock_acompletion_response()
            llm = LiteLLMAIHandler()
            await llm.chat_completion(
                model="gpt-5-2025-08-07",
                system="test system",
                user="test user",
            )

    await issue_request(None)
    mock_logger.warning.assert_not_called()

    # Clear recorded calls so the second phase is counted on its own.
    mock_logger.reset_mock()

    await issue_request("ultra")
    mock_logger.warning.assert_called_once()
@pytest.mark.asyncio
async def test_thinking_kwargs_not_created_for_non_gpt5(self, monkeypatch, mock_logger):
    """For "gpt-4o" neither reasoning_effort nor its allow-list entry reaches acompletion."""
    settings_stub = create_mock_settings("high")
    monkeypatch.setattr(litellm_handler, "get_settings", lambda: settings_stub)

    acompletion_path = 'pr_agent.algo.ai_handlers.litellm_ai_handler.acompletion'
    with patch(acompletion_path, new_callable=AsyncMock) as completion_mock:
        completion_mock.return_value = create_mock_acompletion_response()
        llm = LiteLLMAIHandler()
        await llm.chat_completion(
            model="gpt-4o",
            system="test system",
            user="test user",
        )

    sent_kwargs = completion_mock.call_args.kwargs
    assert "reasoning_effort" not in sent_kwargs

    # allowed_openai_params may be missing or None; when present it must not list reasoning_effort.
    allowed_params = sent_kwargs.get("allowed_openai_params")
    assert allowed_params is None or "reasoning_effort" not in allowed_params
@pytest.mark.asyncio
async def test_case_sensitive_reasoning_effort(self, monkeypatch, mock_logger):
    """Uppercase "LOW" is not accepted: 'medium' is sent instead and one warning is logged."""
    settings_stub = create_mock_settings("LOW")
    monkeypatch.setattr(litellm_handler, "get_settings", lambda: settings_stub)

    acompletion_path = 'pr_agent.algo.ai_handlers.litellm_ai_handler.acompletion'
    with patch(acompletion_path, new_callable=AsyncMock) as completion_mock:
        completion_mock.return_value = create_mock_acompletion_response()
        llm = LiteLLMAIHandler()
        await llm.chat_completion(
            model="gpt-5-2025-08-07",
            system="test system",
            user="test user",
        )

    sent_kwargs = completion_mock.call_args.kwargs
    assert sent_kwargs["reasoning_effort"] == "medium"
    mock_logger.warning.assert_called_once()
""" fake_settings = create_mock_settings("medium") monkeypatch.setattr(litellm_handler, "get_settings", lambda: fake_settings) # Test gpt-50 (will match due to startswith logic) with patch('pr_agent.algo.ai_handlers.litellm_ai_handler.acompletion', new_callable=AsyncMock) as mock_completion: mock_completion.return_value = create_mock_acompletion_response() handler = LiteLLMAIHandler() await handler.chat_completion( model="gpt-50", system="test system", user="test user" ) # Due to startswith('gpt-5'), gpt-50 will match and have reasoning_effort call_kwargs = mock_completion.call_args[1] assert "reasoning_effort" in call_kwargs # Reset mock mock_logger.reset_mock() # Test gpt-5 (should match) with patch('pr_agent.algo.ai_handlers.litellm_ai_handler.acompletion', new_callable=AsyncMock) as mock_completion: mock_completion.return_value = create_mock_acompletion_response() handler = LiteLLMAIHandler() await handler.chat_completion( model="gpt-5", system="test system", user="test user" ) # Should have reasoning_effort call_kwargs = mock_completion.call_args[1] assert call_kwargs["reasoning_effort"] == "medium" ================================================ FILE: tests/unittest/test_load_yaml.py ================================================ # Generated by CodiumAI import pytest import yaml from yaml.scanner import ScannerError from pr_agent.algo.utils import load_yaml class TestLoadYaml: # Tests that load_yaml loads a valid YAML string def test_load_valid_yaml(self): yaml_str = 'name: John Smith\nage: 35' expected_output = {'name': 'John Smith', 'age': 35} assert load_yaml(yaml_str) == expected_output def test_load_invalid_yaml1(self): yaml_str = \ '''\ PR Analysis: Main theme: Enhancing the `/describe` command prompt by adding title and description Type of PR: Enhancement Relevant tests: No Focused PR: Yes, the PR is focused on enhancing the `/describe` command prompt. 
PR Feedback: General suggestions: The PR seems to be well-structured and focused on a specific enhancement. However, it would be beneficial to add tests to ensure the new feature works as expected. Code feedback: - relevant file: pr_agent/settings/pr_description_prompts.toml suggestion: Consider using a more descriptive variable name than 'user' for the command prompt. A more descriptive name would make the code more readable and maintainable. [medium] relevant line: user="""PR Info: aaa Security concerns: No''' with pytest.raises(ScannerError): yaml.safe_load(yaml_str) expected_output = {'PR Analysis': {'Main theme': 'Enhancing the `/describe` command prompt by adding title and description', 'Type of PR': 'Enhancement', 'Relevant tests': False, 'Focused PR': 'Yes, the PR is focused on enhancing the `/describe` command prompt.'}, 'PR Feedback': {'General suggestions': 'The PR seems to be well-structured and focused on a specific enhancement. However, it would be beneficial to add tests to ensure the new feature works as expected.', 'Code feedback': [{'relevant file': 'pr_agent/settings/pr_description_prompts.toml\n', 'suggestion': "Consider using a more descriptive variable name than 'user' for the command prompt. A more descriptive name would make the code more readable and maintainable. 
[medium]", 'relevant line': 'user="""PR Info: aaa\n'}], 'Security concerns': False}} assert load_yaml(yaml_str) == expected_output def test_load_invalid_yaml2(self): yaml_str = '''\ - relevant file: src/app.py: suggestion content: The print statement is outside inside the if __name__ ==: \ ''' with pytest.raises(ScannerError): yaml.safe_load(yaml_str) expected_output = [{'relevant file': 'src/app.py:\n', 'suggestion content': 'The print statement is outside inside the if __name__ ==:'}] assert load_yaml(yaml_str) == expected_output ================================================ FILE: tests/unittest/test_parse_code_suggestion.py ================================================ # Generated by CodiumAI from pr_agent.algo.utils import parse_code_suggestion """ Code Analysis Objective: The objective of the function is to convert a dictionary into a markdown format. The function takes in a dictionary as input and recursively converts it into a markdown format. The function is specifically designed to handle dictionaries that contain code suggestions. 
# An empty dictionary is expected to produce just a newline character
def test_empty_dict(self):
    rendered = parse_code_suggestion({})
    assert rendered == "\n"
@pytest.fixture
def mock_git_provider(self):
    """Build a MagicMock git provider pre-loaded with canned PR data.

    The return values cover the PR branch, changelog file content (empty),
    description, commit messages, languages and files; the PR title is set as
    a plain attribute on the child mock `pr`.
    """
    git_provider = MagicMock()
    # configure_mock resolves dotted keys against child mocks.
    git_provider.configure_mock(**{
        "get_pr_branch.return_value": "feature-branch",
        "get_pr_file_content.return_value": "",
        "pr.title": "Test PR",
        "get_pr_description.return_value": "Test description",
        "get_commit_messages.return_value": "fix: test commit",
        "get_languages.return_value": {"Python": 80, "JavaScript": 20},
        "get_files.return_value": ["test.py", "test.js"],
    })
    return git_provider
def test_get_changelog_file_with_existing_content(self, changelog_tool, mock_git_provider):
    """Content returned by the provider is stored on the tool and reflected in changelog_file_str."""
    # Arrange: the provider hands back a populated changelog
    stored_changelog = "# Changelog\n\n## v1.0.0\n- Initial release\n- Bug fixes"
    mock_git_provider.get_pr_file_content.return_value = stored_changelog

    # Act
    changelog_tool._get_changelog_file()

    # Assert
    loaded = changelog_tool.changelog_file
    assert loaded == stored_changelog
    assert "# Changelog" in changelog_tool.changelog_file_str
def test_prepare_changelog_update_with_existing_content(self, changelog_tool):
    """With commit enabled, the new entry leads the content and the old changelog is retained."""
    # Arrange
    fresh_entry = "## v1.1.0\n- New feature\n- Bug fix"
    prior_changelog = "# Changelog\n\n## v1.0.0\n- Initial release"
    changelog_tool.prediction = fresh_entry
    changelog_tool.changelog_file = prior_changelog
    changelog_tool.commit_changelog = True

    # Act
    updated, reply = changelog_tool._prepare_changelog_update()

    # Assert: entry first (followed by a blank line), previous text still present
    assert updated.startswith(fresh_entry + "\n\n")
    assert prior_changelog in updated
    assert reply == fresh_entry
@pytest.mark.asyncio
async def test_run_without_push_support(self, changelog_tool, mock_git_provider):
    """When the provider has no create_or_update_pr_file, run() publishes a 'not currently supported' comment."""
    # Arrange: deleting the attribute makes later lookups on the mock raise AttributeError
    del mock_git_provider.create_or_update_pr_file
    changelog_tool.commit_changelog = True

    with patch('pr_agent.tools.pr_update_changelog.get_settings') as mock_settings:
        patched_settings = mock_settings.return_value
        patched_settings.pr_update_changelog.push_changelog_changes = True
        patched_settings.config.publish_output = True

        # Act
        await changelog_tool.run()

        # Assert
        publish = mock_git_provider.publish_comment
        publish.assert_called_once()
        assert "not currently supported" in str(publish.call_args)
@pytest.mark.parametrize("existing_content,new_entry,expected_order", [
    (
        "# Changelog\n\n## v1.0.0\n- Old feature",
        "## v1.1.0\n- New feature",
        ["v1.1.0", "v1.0.0"]
    ),
    (
        "",
        "## v1.0.0\n- Initial release",
        ["v1.0.0"]
    ),
    (
        "Some existing content",
        "## v1.0.0\n- New entry",
        ["v1.0.0", "Some existing content"]
    ),
])
def test_changelog_order_preservation(self, changelog_tool, existing_content, new_entry, expected_order):
    """Test that changelog entries are properly ordered (newest first).

    Every marker in ``expected_order`` must occur in the prepared content, and
    consecutive markers must occur in the listed order.
    """
    # Arrange
    changelog_tool.prediction = new_entry
    changelog_tool.changelog_file = existing_content
    changelog_tool.commit_changelog = True

    # Act
    new_content, _ = changelog_tool._prepare_changelog_update()

    # Assert
    located = [(marker, new_content.find(marker)) for marker in expected_order]

    # str.find returns -1 for a missing marker, which would compare as "earlier"
    # than any real position and let the ordering check pass vacuously; it also
    # gives the single-marker case something to verify.
    for marker, position in located:
        assert position != -1, f"Expected {marker} to be present in the changelog"

    for (expected, current_pos), (following, next_pos) in zip(located, located[1:]):
        assert current_pos < next_pos, f"Expected {expected} to come before {following}"
def test_get_secret_provider_initialization_error(self):
    """An exception raised while constructing the AWS provider surfaces as a ValueError."""
    provider_settings = MagicMock()
    provider_settings.get.return_value = "aws_secrets_manager"
    provider_settings.config.secret_provider = "aws_secrets_manager"

    settings_patch = patch('pr_agent.secret_providers.get_settings', return_value=provider_settings)
    provider_patch = patch(
        'pr_agent.secret_providers.aws_secrets_manager_provider.AWSSecretsManagerProvider',
        side_effect=Exception("Initialization failed"),
    )
    expected_message = "Failed to initialize aws_secrets_manager secret provider"

    # Patches are entered in the same order as before: settings first, provider class second.
    with settings_patch, provider_patch:
        with pytest.raises(ValueError, match=expected_message):
            get_secret_provider()
@pytest.mark.asyncio
async def test_similar_issue_non_github_no_publish(monkeypatch):
    """With git_provider "gitlab" and publish_output False, run() returns an empty string."""

    class _GitlabConfig:
        git_provider = "gitlab"
        publish_output = False

    class _StubSettings:
        # Exposed as a class attribute so `get_settings().config.<name>` resolves without instances.
        config = _GitlabConfig

    monkeypatch.setattr("pr_agent.tools.pr_similar_issue.get_settings", lambda: _StubSettings)

    merge_request_url = "https://gitlab.example.com/group/repo/-/merge_requests/1"
    similar_issue_tool = PRSimilarIssue(merge_request_url, None)

    outcome = await similar_issue_tool.run()

    assert outcome == ""
''' expected_output = {'code_suggestions': [{'relevant_file': 'src/index.ts\n', 'label': 'best practice\n'}, {'relevant_file': 'src/index2.ts\n', 'label': 'enhancement'}]} assert try_fix_yaml(review_text, first_key='code_suggestions', last_key='label') == expected_output def test_with_initial_yaml(self): review_text = '''\ I suggest the following: ``` code_suggestions: - relevant_file: | src/index.ts label: | best practice - relevant_file: | src/index2.ts label: | enhancement ``` We can further improve the code by using the `const` keyword instead of `var` in the `src/index.ts` file. ''' expected_output = {'code_suggestions': [{'relevant_file': 'src/index.ts\n', 'label': 'best practice\n'}, {'relevant_file': 'src/index2.ts\n', 'label': 'enhancement'}]} assert try_fix_yaml(review_text, first_key='code_suggestions', last_key='label') == expected_output def test_with_brackets_yaml_content(self): review_text = '''\ { code_suggestions: - relevant_file: | src/index.ts label: | best practice - relevant_file: | src/index2.ts label: | enhancement } ''' expected_output = {'code_suggestions': [{'relevant_file': 'src/index.ts\n', 'label': 'best practice\n'}, {'relevant_file': 'src/index2.ts\n', 'label': 'enhancement'}]} assert try_fix_yaml(review_text, first_key='code_suggestions', last_key='label') == expected_output def test_tab_indent_yaml(self): review_text = '''\ code_suggestions: - relevant_file: | src/index.ts label: | \tbest practice - relevant_file: | src/index2.ts label: | enhancement ''' expected_output = {'code_suggestions': [{'relevant_file': 'src/index.ts\n', 'label': 'best practice\n'}, {'relevant_file': 'src/index2.ts\n', 'label': 'enhancement\n'}]} assert try_fix_yaml(review_text, first_key='code_suggestions', last_key='label') == expected_output def test_leading_plus_mark_code(self): review_text = '''\ code_suggestions: - relevant_file: | src/index.ts label: | best practice existing_code: | + var router = createBrowserRouter([ improved_code: | + const router 
= createBrowserRouter([ ''' expected_output = {'code_suggestions': [{ 'relevant_file': 'src/index.ts\n', 'label': 'best practice\n', 'existing_code': 'var router = createBrowserRouter([\n', 'improved_code': 'const router = createBrowserRouter([\n' }]} assert try_fix_yaml(review_text, first_key='code_suggestions', last_key='improved_code') == expected_output def test_inconsistent_indentation_in_block_scalar_yaml(self): """ This test case represents a situation where the AI outputs the opening '{' with 5 spaces (resulting in an inferred indent level of 5), while the closing '}' is output with only 4 spaces. This inconsistency makes it impossible for the YAML parser to automatically determine the correct indent level, causing a parsing failure. The root cause may be the LLM miscounting spaces or misunderstanding the active block scalar context while generating YAML output. """ review_text = '''\ code_suggestions: - relevant_file: | tsconfig.json existing_code: | { "key1": "value1", "key2": { "subkey": "value" } } ''' expected_json = '''\ { "key1": "value1", "key2": { "subkey": "value" } } ''' expected_output = { 'code_suggestions': [{ 'relevant_file': 'tsconfig.json\n', 'existing_code': expected_json }] } assert try_fix_yaml(review_text, first_key='code_suggestions', last_key='existing_code') == expected_output def test_inconsistent_and_insufficient_indentation_in_block_scalar_yaml(self): """ This test case reproduces a YAML parsing failure where the block scalar content generated by the AI includes inconsistent and insufficient indentation levels. The root cause may be the LLM miscounting spaces or misunderstanding the active block scalar context while generating YAML output. 
""" review_text = '''\ code_suggestions: - relevant_file: | tsconfig.json existing_code: | { "key1": "value1", "key2": { "subkey": "value" } } ''' expected_json = '''\ { "key1": "value1", "key2": { "subkey": "value" } } ''' expected_output = { 'code_suggestions': [{ 'relevant_file': 'tsconfig.json\n', 'existing_code': expected_json }] } assert try_fix_yaml(review_text, first_key='code_suggestions', last_key='existing_code') == expected_output def test_wrong_indentation_code_block_scalar(self): review_text = '''\ code_suggestions: - relevant_file: | a.c existing_code: | int sum(int a, int b) { return a + b; } int sub(int a, int b) { return a - b; } ''' expected_code_block = '''\ int sum(int a, int b) { return a + b; } int sub(int a, int b) { return a - b; } ''' expected_output = {'code_suggestions': [{'relevant_file': 'a.c\n', 'existing_code': ' int sum(int a, int b) {\n return a + b;\n }\n\n int sub(int a, int b) {\n return a - b;\n }\n'}]} assert try_fix_yaml(review_text, first_key='code_suggestions', last_key='existing_code') == expected_output