Repository: eval-sys/mcpmark
Branch: main
Commit: adc5e6558f05
Files: 670
Total size: 3.1 MB
Directory structure:
mcpmark/
├── .dockerignore
├── .editorconfig
├── .gitattributes
├── .github/
│ ├── ISSUE_TEMPLATE/
│ │ ├── 1_bug_report.yml
│ │ ├── 2_feature_request.yml
│ │ └── config.yml
│ ├── PULL_REQUEST_TEMPLATE.md
│ ├── scripts/
│ │ └── pr-comment.js
│ └── workflows/
│ └── publish-docker-image.yml
├── .gitignore
├── CHANGELOG.md
├── Dockerfile
├── LICENSE
├── README.md
├── build-docker.sh
├── cspell.config.yaml
├── docs/
│ ├── contributing/
│ │ └── make-contribution.md
│ ├── datasets/
│ │ └── task.md
│ ├── installation_and_docker_usage.md
│ ├── introduction.md
│ ├── mcp/
│ │ ├── filesystem.md
│ │ ├── github.md
│ │ ├── notion.md
│ │ ├── playwright.md
│ │ └── postgres.md
│ └── quickstart.md
├── pipeline.py
├── pyproject.toml
├── run-benchmark.sh
├── run-task.sh
├── src/
│ ├── agents/
│ │ ├── __init__.py
│ │ ├── base_agent.py
│ │ ├── mcp/
│ │ │ ├── __init__.py
│ │ │ ├── http_server.py
│ │ │ └── stdio_server.py
│ │ ├── mcpmark_agent.py
│ │ ├── react_agent.py
│ │ └── utils/
│ │ ├── __init__.py
│ │ └── token_usage.py
│ ├── aggregators/
│ │ ├── aggregate_results.py
│ │ ├── aggregate_specific_results.py
│ │ ├── aggregate_task_meta.py
│ │ └── pricing.py
│ ├── base/
│ │ ├── __init__.py
│ │ ├── login_helper.py
│ │ ├── state_manager.py
│ │ └── task_manager.py
│ ├── config/
│ │ ├── __init__.py
│ │ └── config_schema.py
│ ├── errors.py
│ ├── evaluator.py
│ ├── factory.py
│ ├── logger.py
│ ├── mcp_services/
│ │ ├── filesystem/
│ │ │ ├── __init__.py
│ │ │ ├── filesystem_login_helper.py
│ │ │ ├── filesystem_state_manager.py
│ │ │ └── filesystem_task_manager.py
│ │ ├── github/
│ │ │ ├── __init__.py
│ │ │ ├── github_login_helper.py
│ │ │ ├── github_state_manager.py
│ │ │ ├── github_task_manager.py
│ │ │ ├── repo_exporter.py
│ │ │ ├── repo_importer.py
│ │ │ └── token_pool.py
│ │ ├── insforge/
│ │ │ ├── __init__.py
│ │ │ ├── insforge_login_helper.py
│ │ │ ├── insforge_state_manager.py
│ │ │ └── insforge_task_manager.py
│ │ ├── notion/
│ │ │ ├── __init__.py
│ │ │ ├── notion_login_helper.py
│ │ │ ├── notion_state_manager.py
│ │ │ └── notion_task_manager.py
│ │ ├── playwright/
│ │ │ ├── __init__.py
│ │ │ ├── playwright_login_helper.py
│ │ │ ├── playwright_state_manager.py
│ │ │ └── playwright_task_manager.py
│ │ ├── playwright_webarena/
│ │ │ ├── playwright_login_helper.py
│ │ │ ├── playwright_state_manager.py
│ │ │ ├── playwright_task_manager.py
│ │ │ └── reddit_env_setup.md
│ │ ├── postgres/
│ │ │ ├── __init__.py
│ │ │ ├── postgres_login_helper.py
│ │ │ ├── postgres_state_manager.py
│ │ │ └── postgres_task_manager.py
│ │ └── supabase/
│ │ ├── __init__.py
│ │ ├── supabase_login_helper.py
│ │ ├── supabase_state_manager.py
│ │ └── supabase_task_manager.py
│ ├── model_config.py
│ ├── results_reporter.py
│ └── services.py
└── tasks/
├── __init__.py
├── filesystem/
│ ├── easy/
│ │ ├── .gitkeep
│ │ ├── file_context/
│ │ │ ├── file_splitting/
│ │ │ │ ├── description.md
│ │ │ │ ├── meta.json
│ │ │ │ └── verify.py
│ │ │ ├── pattern_matching/
│ │ │ │ ├── description.md
│ │ │ │ ├── meta.json
│ │ │ │ └── verify.py
│ │ │ └── uppercase/
│ │ │ ├── description.md
│ │ │ ├── meta.json
│ │ │ └── verify.py
│ │ ├── file_property/
│ │ │ ├── largest_rename/
│ │ │ │ ├── description.md
│ │ │ │ ├── meta.json
│ │ │ │ └── verify.py
│ │ │ └── txt_merging/
│ │ │ ├── description.md
│ │ │ ├── meta.json
│ │ │ └── verify.py
│ │ ├── folder_structure/
│ │ │ └── structure_analysis/
│ │ │ ├── description.md
│ │ │ ├── meta.json
│ │ │ └── verify.py
│ │ ├── legal_document/
│ │ │ └── file_reorganize/
│ │ │ ├── description.md
│ │ │ ├── meta.json
│ │ │ └── verify.py
│ │ ├── papers/
│ │ │ └── papers_counting/
│ │ │ ├── description.md
│ │ │ ├── meta.json
│ │ │ └── verify.py
│ │ └── student_database/
│ │ ├── duplicate_name/
│ │ │ ├── description.md
│ │ │ ├── meta.json
│ │ │ └── verify.py
│ │ └── recommender_name/
│ │ ├── description.md
│ │ ├── meta.json
│ │ └── verify.py
│ └── standard/
│ ├── desktop/
│ │ ├── music_report/
│ │ │ ├── description.md
│ │ │ ├── meta.json
│ │ │ └── verify.py
│ │ ├── project_management/
│ │ │ ├── description.md
│ │ │ ├── meta.json
│ │ │ └── verify.py
│ │ └── timeline_extraction/
│ │ ├── description.md
│ │ ├── meta.json
│ │ └── verify.py
│ ├── desktop_template/
│ │ ├── budget_computation/
│ │ │ ├── description.md
│ │ │ ├── meta.json
│ │ │ └── verify.py
│ │ ├── contact_information/
│ │ │ ├── description.md
│ │ │ ├── meta.json
│ │ │ └── verify.py
│ │ └── file_arrangement/
│ │ ├── description.md
│ │ ├── meta.json
│ │ └── verify.py
│ ├── file_context/
│ │ ├── duplicates_searching/
│ │ │ ├── description.md
│ │ │ ├── meta.json
│ │ │ └── verify.py
│ │ ├── file_merging/
│ │ │ ├── description.md
│ │ │ ├── meta.json
│ │ │ └── verify.py
│ │ ├── file_splitting/
│ │ │ ├── description.md
│ │ │ ├── meta.json
│ │ │ └── verify.py
│ │ ├── pattern_matching/
│ │ │ ├── description.md
│ │ │ ├── meta.json
│ │ │ └── verify.py
│ │ └── uppercase/
│ │ ├── description.md
│ │ ├── meta.json
│ │ └── verify.py
│ ├── file_property/
│ │ ├── size_classification/
│ │ │ ├── description.md
│ │ │ ├── meta.json
│ │ │ └── verify.py
│ │ └── time_classification/
│ │ ├── description.md
│ │ ├── meta.json
│ │ └── verify.py
│ ├── folder_structure/
│ │ ├── structure_analysis/
│ │ │ ├── description.md
│ │ │ ├── meta.json
│ │ │ └── verify.py
│ │ └── structure_mirror/
│ │ ├── description.md
│ │ ├── meta.json
│ │ └── verify.py
│ ├── legal_document/
│ │ ├── dispute_review/
│ │ │ ├── description.md
│ │ │ ├── meta.json
│ │ │ └── verify.py
│ │ ├── individual_comments/
│ │ │ ├── description.md
│ │ │ ├── meta.json
│ │ │ └── verify.py
│ │ └── solution_tracing/
│ │ ├── description.md
│ │ ├── meta.json
│ │ └── verify.py
│ ├── papers/
│ │ ├── author_folders/
│ │ │ ├── description.md
│ │ │ ├── meta.json
│ │ │ └── verify.py
│ │ ├── find_math_paper/
│ │ │ ├── description.md
│ │ │ ├── meta.json
│ │ │ └── verify.py
│ │ └── organize_legacy_papers/
│ │ ├── description.md
│ │ ├── meta.json
│ │ └── verify.py
│ ├── student_database/
│ │ ├── duplicate_name/
│ │ │ ├── description.md
│ │ │ ├── meta.json
│ │ │ └── verify.py
│ │ ├── english_talent/
│ │ │ ├── description.md
│ │ │ ├── meta.json
│ │ │ └── verify.py
│ │ └── gradebased_score/
│ │ ├── description.md
│ │ ├── meta.json
│ │ └── verify.py
│ ├── threestudio/
│ │ ├── code_locating/
│ │ │ ├── description.md
│ │ │ ├── meta.json
│ │ │ └── verify.py
│ │ ├── output_analysis/
│ │ │ ├── description.md
│ │ │ ├── meta.json
│ │ │ └── verify.py
│ │ └── requirements_completion/
│ │ ├── description.md
│ │ ├── meta.json
│ │ └── verify.py
│ └── votenet/
│ ├── dataset_comparison/
│ │ ├── description.md
│ │ ├── meta.json
│ │ └── verify.py
│ ├── debugging/
│ │ ├── description.md
│ │ ├── meta.json
│ │ └── verify.py
│ └── requirements_writing/
│ ├── description.md
│ ├── meta.json
│ └── verify.py
├── github/
│ ├── easy/
│ │ ├── build-your-own-x/
│ │ │ ├── close_commented_issues/
│ │ │ │ ├── description.md
│ │ │ │ ├── meta.json
│ │ │ │ └── verify.py
│ │ │ └── record_recent_commits/
│ │ │ ├── description.md
│ │ │ ├── meta.json
│ │ │ └── verify.py
│ │ ├── claude-code/
│ │ │ ├── add_terminal_shortcuts_doc/
│ │ │ │ ├── description.md
│ │ │ │ ├── meta.json
│ │ │ │ └── verify.py
│ │ │ ├── thank_docker_pr_author/
│ │ │ │ ├── description.md
│ │ │ │ ├── meta.json
│ │ │ │ └── verify.py
│ │ │ └── triage_missing_tool_result_issue/
│ │ │ ├── description.md
│ │ │ ├── meta.json
│ │ │ └── verify.py
│ │ ├── mcpmark-cicd/
│ │ │ ├── basic_ci_checks/
│ │ │ │ ├── description.md
│ │ │ │ ├── meta.json
│ │ │ │ └── verify.py
│ │ │ ├── issue_lint_guard/
│ │ │ │ ├── description.md
│ │ │ │ ├── meta.json
│ │ │ │ └── verify.py
│ │ │ └── nightly_health_check/
│ │ │ ├── description.md
│ │ │ ├── meta.json
│ │ │ └── verify.py
│ │ └── missing-semester/
│ │ ├── count_translations/
│ │ │ ├── description.md
│ │ │ ├── meta.json
│ │ │ └── verify.py
│ │ └── find_ga_tracking_id/
│ │ ├── description.md
│ │ ├── meta.json
│ │ └── verify.py
│ └── standard/
│ ├── build_your_own_x/
│ │ ├── find_commit_date/
│ │ │ ├── description.md
│ │ │ ├── meta.json
│ │ │ └── verify.py
│ │ └── find_rag_commit/
│ │ ├── description.md
│ │ ├── meta.json
│ │ └── verify.py
│ ├── claude-code/
│ │ ├── automated_changelog_generation/
│ │ │ ├── description.md
│ │ │ ├── meta.json
│ │ │ └── verify.py
│ │ ├── claude_collaboration_analysis/
│ │ │ ├── description.md
│ │ │ ├── meta.json
│ │ │ └── verify.py
│ │ ├── critical_issue_hotfix_workflow/
│ │ │ ├── description.md
│ │ │ ├── meta.json
│ │ │ └── verify.py
│ │ ├── feature_commit_tracking/
│ │ │ ├── description.md
│ │ │ ├── meta.json
│ │ │ └── verify.py
│ │ └── label_color_standardization/
│ │ ├── description.md
│ │ ├── meta.json
│ │ └── verify.py
│ ├── easyr1/
│ │ ├── advanced_branch_strategy/
│ │ │ ├── description.md
│ │ │ ├── meta.json
│ │ │ └── verify.py
│ │ ├── config_parameter_audit/
│ │ │ ├── description.md
│ │ │ ├── meta.json
│ │ │ └── verify.py
│ │ ├── performance_regression_investigation/
│ │ │ ├── description.md
│ │ │ ├── meta.json
│ │ │ └── verify.py
│ │ └── qwen3_issue_management/
│ │ ├── description.md
│ │ ├── meta.json
│ │ └── verify.py
│ ├── harmony/
│ │ ├── fix_conflict/
│ │ │ ├── description.md
│ │ │ ├── meta.json
│ │ │ └── verify.py
│ │ ├── issue_pr_commit_workflow/
│ │ │ ├── description.md
│ │ │ ├── meta.json
│ │ │ └── verify.py
│ │ ├── issue_tagging_pr_closure/
│ │ │ ├── description.md
│ │ │ ├── meta.json
│ │ │ └── verify.py
│ │ ├── multi_branch_commit_aggregation/
│ │ │ ├── description.md
│ │ │ ├── meta.json
│ │ │ └── verify.py
│ │ └── release_management_workflow/
│ │ ├── description.md
│ │ ├── meta.json
│ │ └── verify.py
│ ├── mcpmark-cicd/
│ │ ├── deployment_status_workflow/
│ │ │ ├── description.md
│ │ │ ├── meta.json
│ │ │ └── verify.py
│ │ ├── issue_management_workflow/
│ │ │ ├── description.md
│ │ │ ├── meta.json
│ │ │ └── verify.py
│ │ ├── linting_ci_workflow/
│ │ │ ├── description.md
│ │ │ ├── meta.json
│ │ │ └── verify.py
│ │ └── pr_automation_workflow/
│ │ ├── description.md
│ │ ├── meta.json
│ │ └── verify.py
│ └── missing-semester/
│ ├── assign_contributor_labels/
│ │ ├── description.md
│ │ ├── meta.json
│ │ └── verify.py
│ ├── find_legacy_name/
│ │ ├── description.md
│ │ ├── meta.json
│ │ └── verify.py
│ └── find_salient_file/
│ ├── description.md
│ ├── meta.json
│ └── verify.py
├── notion/
│ ├── easy/
│ │ ├── .gitkeep
│ │ ├── computer_science_student_dashboard/
│ │ │ ├── simple__code_snippets_go/
│ │ │ │ ├── description.md
│ │ │ │ ├── meta.json
│ │ │ │ └── verify.py
│ │ │ └── simple__study_session_tracker/
│ │ │ ├── description.md
│ │ │ ├── meta.json
│ │ │ └── verify.py
│ │ ├── it_trouble_shooting_hub/
│ │ │ └── simple__asset_retirement_migration/
│ │ │ ├── description.md
│ │ │ ├── meta.json
│ │ │ └── verify.py
│ │ ├── japan_travel_planner/
│ │ │ └── simple__remove_osaka_itinerary/
│ │ │ ├── description.md
│ │ │ ├── meta.json
│ │ │ └── verify.py
│ │ ├── online_resume/
│ │ │ └── simple__skills_development_tracker/
│ │ │ ├── description.md
│ │ │ ├── meta.json
│ │ │ └── verify.py
│ │ ├── python_roadmap/
│ │ │ └── simple__expert_level_lessons/
│ │ │ ├── description.md
│ │ │ ├── meta.json
│ │ │ └── verify.py
│ │ ├── self_assessment/
│ │ │ └── simple__faq_column_layout/
│ │ │ ├── description.md
│ │ │ ├── meta.json
│ │ │ └── verify.py
│ │ ├── standard_operating_procedure/
│ │ │ └── simple__section_organization/
│ │ │ ├── description.md
│ │ │ ├── meta.json
│ │ │ └── verify.py
│ │ ├── team_projects/
│ │ │ └── simple__swap_tasks/
│ │ │ ├── description.md
│ │ │ ├── meta.json
│ │ │ └── verify.py
│ │ └── toronto_guide/
│ │ └── simple__change_color/
│ │ ├── description.md
│ │ ├── meta.json
│ │ └── verify.py
│ └── standard/
│ ├── company_in_a_box/
│ │ ├── employee_onboarding/
│ │ │ ├── description.md
│ │ │ ├── meta.json
│ │ │ └── verify.py
│ │ ├── goals_restructure/
│ │ │ ├── description.md
│ │ │ ├── meta.json
│ │ │ └── verify.py
│ │ └── quarterly_review_dashboard/
│ │ ├── description.md
│ │ ├── meta.json
│ │ └── verify.py
│ ├── computer_science_student_dashboard/
│ │ ├── code_snippets_go/
│ │ │ ├── description.md
│ │ │ ├── meta.json
│ │ │ └── verify.py
│ │ ├── courses_internships_relation/
│ │ │ ├── description.md
│ │ │ ├── meta.json
│ │ │ └── verify.py
│ │ └── study_session_tracker/
│ │ ├── description.md
│ │ ├── meta.json
│ │ └── verify.py
│ ├── it_trouble_shooting_hub/
│ │ ├── asset_retirement_migration/
│ │ │ ├── description.md
│ │ │ ├── meta.json
│ │ │ └── verify.py
│ │ ├── security_audit_ticket/
│ │ │ ├── description.md
│ │ │ ├── meta.json
│ │ │ └── verify.py
│ │ └── verification_expired_update/
│ │ ├── description.md
│ │ ├── meta.json
│ │ └── verify.py
│ ├── japan_travel_planner/
│ │ ├── daily_itinerary_overview/
│ │ │ ├── description.md
│ │ │ ├── meta.json
│ │ │ └── verify.py
│ │ ├── packing_progress_summary/
│ │ │ ├── description.md
│ │ │ ├── meta.json
│ │ │ └── verify.py
│ │ ├── remove_osaka_itinerary/
│ │ │ ├── description.md
│ │ │ ├── meta.json
│ │ │ └── verify.py
│ │ └── restaurant_expenses_sync/
│ │ ├── description.md
│ │ ├── meta.json
│ │ └── verify.py
│ ├── online_resume/
│ │ ├── layout_adjustment/
│ │ │ ├── description.md
│ │ │ ├── meta.json
│ │ │ └── verify.py
│ │ ├── projects_section_update/
│ │ │ ├── description.md
│ │ │ ├── meta.json
│ │ │ └── verify.py
│ │ ├── skills_development_tracker/
│ │ │ ├── description.md
│ │ │ ├── meta.json
│ │ │ └── verify.py
│ │ └── work_history_addition/
│ │ ├── description.md
│ │ ├── meta.json
│ │ └── verify.py
│ ├── python_roadmap/
│ │ ├── expert_level_lessons/
│ │ │ ├── description.md
│ │ │ ├── meta.json
│ │ │ └── verify.py
│ │ └── learning_metrics_dashboard/
│ │ ├── description.md
│ │ ├── meta.json
│ │ └── verify.py
│ ├── self_assessment/
│ │ ├── faq_column_layout/
│ │ │ ├── description.md
│ │ │ ├── meta.json
│ │ │ └── verify.py
│ │ ├── hyperfocus_analysis_report/
│ │ │ ├── description.md
│ │ │ ├── meta.json
│ │ │ └── verify.py
│ │ └── numbered_list_emojis/
│ │ ├── description.md
│ │ ├── meta.json
│ │ └── verify.py
│ ├── standard_operating_procedure/
│ │ ├── deployment_process_sop/
│ │ │ ├── description.md
│ │ │ ├── meta.json
│ │ │ └── verify.py
│ │ └── section_organization/
│ │ ├── description.md
│ │ ├── meta.json
│ │ └── verify.py
│ ├── team_projects/
│ │ ├── priority_tasks_table/
│ │ │ ├── description.md
│ │ │ ├── meta.json
│ │ │ └── verify.py
│ │ └── swap_tasks/
│ │ ├── description.md
│ │ ├── meta.json
│ │ └── verify.py
│ └── toronto_guide/
│ ├── change_color/
│ │ ├── description.md
│ │ ├── meta.json
│ │ └── verify.py
│ └── weekend_adventure_planner/
│ ├── description.md
│ ├── meta.json
│ └── verify.py
├── playwright/
│ ├── easy/
│ │ └── .gitkeep
│ └── standard/
│ ├── eval_web/
│ │ ├── cloudflare_turnstile_challenge/
│ │ │ ├── description.md
│ │ │ ├── meta.json
│ │ │ └── verify.py
│ │ └── extraction_table/
│ │ ├── data.csv
│ │ ├── description.md
│ │ ├── meta.json
│ │ └── verify.py
│ └── web_search/
│ ├── birth_of_arvinxu/
│ │ ├── description.md
│ │ ├── meta.json
│ │ └── verify.py
│ └── r1_arxiv/
│ ├── content.txt
│ ├── description.md
│ ├── meta.json
│ └── verify.py
├── playwright_webarena/
│ ├── easy/
│ │ ├── .gitkeep
│ │ ├── reddit/
│ │ │ ├── ai_data_analyst/
│ │ │ │ ├── description.md
│ │ │ │ ├── label.txt
│ │ │ │ ├── meta.json
│ │ │ │ └── verify.py
│ │ │ ├── llm_research_summary/
│ │ │ │ ├── description.md
│ │ │ │ ├── label.txt
│ │ │ │ ├── meta.json
│ │ │ │ └── verify.py
│ │ │ ├── movie_reviewer_analysis/
│ │ │ │ ├── description.md
│ │ │ │ ├── label.txt
│ │ │ │ ├── meta.json
│ │ │ │ └── verify.py
│ │ │ ├── nba_statistics_analysis/
│ │ │ │ ├── description.md
│ │ │ │ ├── label.txt
│ │ │ │ ├── meta.json
│ │ │ │ └── verify.py
│ │ │ └── routine_tracker_forum/
│ │ │ ├── description.md
│ │ │ ├── meta.json
│ │ │ └── verify.py
│ │ └── shopping_admin/
│ │ ├── fitness_promotion_strategy/
│ │ │ ├── description.md
│ │ │ ├── label.txt
│ │ │ ├── meta.json
│ │ │ └── verify.py
│ │ ├── ny_expansion_analysis/
│ │ │ ├── description.md
│ │ │ ├── label.txt
│ │ │ ├── meta.json
│ │ │ └── verify.py
│ │ ├── products_sales_analysis/
│ │ │ ├── description.md
│ │ │ ├── label.txt
│ │ │ ├── meta.json
│ │ │ └── verify.py
│ │ ├── sales_inventory_analysis/
│ │ │ ├── description.md
│ │ │ ├── label.txt
│ │ │ ├── meta.json
│ │ │ └── verify.py
│ │ └── search_filtering_operations/
│ │ ├── description.md
│ │ ├── label.txt
│ │ ├── meta.json
│ │ └── verify.py
│ └── standard/
│ ├── reddit/
│ │ ├── ai_data_analyst/
│ │ │ ├── description.md
│ │ │ ├── label.txt
│ │ │ ├── meta.json
│ │ │ └── verify.py
│ │ ├── budget_europe_travel/
│ │ │ ├── description.md
│ │ │ ├── meta.json
│ │ │ └── verify.py
│ │ ├── buyitforlife_research/
│ │ │ ├── description.md
│ │ │ ├── label.txt
│ │ │ ├── meta.json
│ │ │ └── verify.py
│ │ ├── llm_research_summary/
│ │ │ ├── description.md
│ │ │ ├── label.txt
│ │ │ ├── meta.json
│ │ │ └── verify.py
│ │ ├── movie_reviewer_analysis/
│ │ │ ├── description.md
│ │ │ ├── label.txt
│ │ │ ├── meta.json
│ │ │ └── verify.py
│ │ ├── nba_statistics_analysis/
│ │ │ ├── description.md
│ │ │ ├── label.txt
│ │ │ ├── meta.json
│ │ │ └── verify.py
│ │ └── routine_tracker_forum/
│ │ ├── description.md
│ │ ├── meta.json
│ │ └── verify.py
│ ├── shopping/
│ │ ├── advanced_product_analysis/
│ │ │ ├── description.md
│ │ │ ├── label.txt
│ │ │ ├── meta.json
│ │ │ └── verify.py
│ │ ├── gaming_accessories_analysis/
│ │ │ ├── description.md
│ │ │ ├── label.txt
│ │ │ ├── meta.json
│ │ │ └── verify.py
│ │ ├── health_routine_optimization/
│ │ │ ├── description.md
│ │ │ ├── label.txt
│ │ │ ├── meta.json
│ │ │ └── verify.py
│ │ ├── holiday_baking_competition/
│ │ │ ├── description.md
│ │ │ ├── label.txt
│ │ │ ├── meta.json
│ │ │ └── verify.py
│ │ ├── multi_category_budget_analysis/
│ │ │ ├── description.md
│ │ │ ├── label.txt
│ │ │ ├── meta.json
│ │ │ └── verify.py
│ │ ├── printer_keyboard_search/
│ │ │ ├── description.md
│ │ │ ├── label.txt
│ │ │ ├── meta.json
│ │ │ └── verify.py
│ │ └── running_shoes_purchase/
│ │ ├── description.md
│ │ ├── label.txt
│ │ ├── meta.json
│ │ └── verify.py
│ └── shopping_admin/
│ ├── customer_segmentation_setup/
│ │ ├── description.md
│ │ ├── label.txt
│ │ ├── meta.json
│ │ └── verify.py
│ ├── fitness_promotion_strategy/
│ │ ├── description.md
│ │ ├── label.txt
│ │ ├── meta.json
│ │ └── verify.py
│ ├── marketing_customer_analysis/
│ │ ├── description.md
│ │ ├── label.txt
│ │ ├── meta.json
│ │ └── verify.py
│ ├── ny_expansion_analysis/
│ │ ├── description.md
│ │ ├── label.txt
│ │ ├── meta.json
│ │ └── verify.py
│ ├── products_sales_analysis/
│ │ ├── description.md
│ │ ├── label.txt
│ │ ├── meta.json
│ │ └── verify.py
│ ├── sales_inventory_analysis/
│ │ ├── description.md
│ │ ├── label.txt
│ │ ├── meta.json
│ │ └── verify.py
│ └── search_filtering_operations/
│ ├── description.md
│ ├── label.txt
│ ├── meta.json
│ └── verify.py
├── postgres/
│ ├── easy/
│ │ ├── .gitkeep
│ │ ├── chinook/
│ │ │ ├── customer_data_migration_basic/
│ │ │ │ ├── customer_data.pkl
│ │ │ │ ├── description.md
│ │ │ │ ├── meta.json
│ │ │ │ └── verify.py
│ │ │ └── update_employee_info/
│ │ │ ├── description.md
│ │ │ ├── meta.json
│ │ │ └── verify.py
│ │ ├── dvdrental/
│ │ │ └── create_payment_index/
│ │ │ ├── description.md
│ │ │ ├── meta.json
│ │ │ └── verify.py
│ │ ├── employees/
│ │ │ ├── department_summary_view/
│ │ │ │ ├── description.md
│ │ │ │ ├── meta.json
│ │ │ │ └── verify.py
│ │ │ ├── employee_gender_statistics/
│ │ │ │ ├── description.md
│ │ │ │ ├── meta.json
│ │ │ │ └── verify.py
│ │ │ ├── employee_projects_basic/
│ │ │ │ ├── description.md
│ │ │ │ ├── meta.json
│ │ │ │ └── verify.py
│ │ │ └── hiring_year_summary/
│ │ │ ├── description.md
│ │ │ ├── meta.json
│ │ │ └── verify.py
│ │ ├── lego/
│ │ │ ├── basic_security_setup/
│ │ │ │ ├── description.md
│ │ │ │ ├── meta.json
│ │ │ │ └── verify.py
│ │ │ └── fix_data_inconsistencies/
│ │ │ ├── description.md
│ │ │ ├── meta.json
│ │ │ └── verify.py
│ │ └── sports/
│ │ └── create_performance_indexes/
│ │ ├── description.md
│ │ ├── meta.json
│ │ └── verify.py
│ └── standard/
│ ├── chinook/
│ │ ├── customer_data_migration/
│ │ │ ├── customer_data.pkl
│ │ │ ├── description.md
│ │ │ ├── meta.json
│ │ │ └── verify.py
│ │ ├── employee_hierarchy_management/
│ │ │ ├── description.md
│ │ │ ├── meta.json
│ │ │ └── verify.py
│ │ └── sales_and_music_charts/
│ │ ├── description.md
│ │ ├── meta.json
│ │ └── verify.py
│ ├── dvdrental/
│ │ ├── customer_analysis_fix/
│ │ │ ├── description.md
│ │ │ ├── meta.json
│ │ │ └── verify.py
│ │ ├── customer_analytics_optimization/
│ │ │ ├── description.md
│ │ │ ├── meta.json
│ │ │ └── verify.py
│ │ └── film_inventory_management/
│ │ ├── description.md
│ │ ├── meta.json
│ │ └── verify.py
│ ├── employees/
│ │ ├── employee_demographics_report/
│ │ │ ├── description.md
│ │ │ ├── meta.json
│ │ │ └── verify.py
│ │ ├── employee_performance_analysis/
│ │ │ ├── description.md
│ │ │ ├── meta.json
│ │ │ └── verify.py
│ │ ├── employee_project_tracking/
│ │ │ ├── description.md
│ │ │ ├── meta.json
│ │ │ └── verify.py
│ │ ├── employee_retention_analysis/
│ │ │ ├── description.md
│ │ │ ├── meta.json
│ │ │ └── verify.py
│ │ ├── executive_dashboard_automation/
│ │ │ ├── description.md
│ │ │ ├── meta.json
│ │ │ └── verify.py
│ │ └── management_structure_analysis/
│ │ ├── description.md
│ │ ├── meta.json
│ │ └── verify.py
│ ├── lego/
│ │ ├── consistency_enforcement/
│ │ │ ├── description.md
│ │ │ ├── meta.json
│ │ │ └── verify.py
│ │ ├── database_security_policies/
│ │ │ ├── description.md
│ │ │ ├── meta.json
│ │ │ └── verify.py
│ │ └── transactional_inventory_transfer/
│ │ ├── description.md
│ │ ├── meta.json
│ │ └── verify.py
│ ├── security/
│ │ ├── rls_business_access/
│ │ │ ├── description.md
│ │ │ ├── ground_truth.sql
│ │ │ ├── meta.json
│ │ │ ├── prepare_environment.py
│ │ │ └── verify.py
│ │ └── user_permission_audit/
│ │ ├── description.md
│ │ ├── ground_truth.sql
│ │ ├── meta.json
│ │ ├── prepare_environment.py
│ │ └── verify.py
│ ├── sports/
│ │ ├── baseball_player_analysis/
│ │ │ ├── description.md
│ │ │ ├── meta.json
│ │ │ └── verify.py
│ │ ├── participant_report_optimization/
│ │ │ ├── description.md
│ │ │ ├── meta.json
│ │ │ └── verify.py
│ │ └── team_roster_management/
│ │ ├── description.md
│ │ ├── meta.json
│ │ └── verify.py
│ └── vectors/
│ ├── dba_vector_analysis/
│ │ ├── description.md
│ │ ├── ground_truth.sql
│ │ ├── meta.json
│ │ ├── prepare_environment.py
│ │ └── verify.py
│ └── vectors_setup.py
└── utils/
├── __init__.py
├── notion_utils.py
└── postgres_utils.py
================================================
FILE CONTENTS
================================================
================================================
FILE: .dockerignore
================================================
# Git
.git
.gitignore
# Python
__pycache__
*.pyc
*.pyo
*.pyd
.Python
*.egg
*.egg-info/
dist/
build/
.eggs/
*.so
# Virtual environments
venv/
env/
ENV/
.venv/
# IDE
.vscode/
.idea/
*.swp
*.swo
*~
.DS_Store
# Environment files (contain secrets)
.env
.mcp_env
notion_state.json
# Test and development files
.pytest_cache/
.coverage
htmlcov/
.tox/
.mypy_cache/
.ruff_cache/
tests/
test_environments/
# Results and logs
results/
*.log
logs/
# PostgreSQL data
.postgres/
# Playwright
playwright-report/
test-results/
# Documentation images
asset/
# Temporary files
*.tmp
tmp/
temp/
# Docker
Dockerfile
docker-compose.yml
.dockerignore
# Node modules (if any locally installed)
node_modules/
# Pixi lock file
pixi.lock
.pixi/
# GitHub state files
github_state/
github_template_repo/
# Backup directories
.mcpbench_backups/
================================================
FILE: .editorconfig
================================================
root = true
; Always use Unix-style newlines with a final newline in every file, and trim trailing whitespace
[*]
end_of_line = lf
insert_final_newline = true
trim_trailing_whitespace = true
; Python: PEP8 defines 4 spaces for indentation
[*.py]
indent_style = space
indent_size = 4
================================================
FILE: .gitattributes
================================================
# SCM syntax highlighting & preventing 3-way merges
pixi.lock merge=binary linguist-language=YAML linguist-generated=true
================================================
FILE: .github/ISSUE_TEMPLATE/1_bug_report.yml
================================================
name: '🐛 Bug Report'
description: 'Report a bug'
labels: ['unconfirm']
type: Bug
body:
- type: textarea
attributes:
label: '🐛 Bug Description'
description: A clear and concise description of the bug. If the above option is `Other`, please also explain in detail.
validations:
required: true
- type: textarea
attributes:
label: '📷 Reproduction Steps'
description: A clear and concise description of how to reproduce the issue.
- type: textarea
attributes:
label: '🚦 Expected Behavior'
description: A clear and concise description of what you expected to happen.
- type: textarea
attributes:
label: '📝 Additional Information'
description: If your problem needs further explanation, or if the issue you're seeing cannot be reproduced in a gist, please add more information here.
================================================
FILE: .github/ISSUE_TEMPLATE/2_feature_request.yml
================================================
name: '🌠 Feature Request'
description: 'Suggest an idea'
title: '[Request] '
type: Feature
body:
- type: textarea
attributes:
label: '🥰 Feature Description'
description: Please add a clear and concise description of the problem you are seeking to solve with this feature request.
validations:
required: true
- type: textarea
attributes:
label: '🧐 Proposed Solution'
description: Describe the solution you'd like in a clear and concise manner.
validations:
required: true
- type: textarea
attributes:
label: '📝 Additional Information'
description: Add any other context about the problem here.
================================================
FILE: .github/ISSUE_TEMPLATE/config.yml
================================================
contact_links:
- name: Questions and ideas
url: https://github.com/eval-sys/mcpmark/discussions/new/choose
about: Please post questions and ideas in discussions.
================================================
FILE: .github/PULL_REQUEST_TEMPLATE.md
================================================
#### Change Type
- [ ] ✨ feat
- [ ] 🐛 fix
- [ ] ♻️ refactor
- [ ] 💄 style
- [ ] 👷 build
- [ ] ⚡️ perf
- [ ] 📝 docs
- [ ] 🔨 chore
#### Description of Change
#### Additional Information
================================================
FILE: .github/scripts/pr-comment.js
================================================
/**
* Generate or update PR comment with Docker build info
*/
module.exports = async ({ github, context, dockerMetaJson, image, version, dockerhubUrl, platforms }) => {
const COMMENT_IDENTIFIER = '<!-- mcpmark-docker-build-comment -->'; // assumed unique marker used to find and update this bot comment on later builds
const parseTags = () => {
try {
if (dockerMetaJson) {
const parsed = JSON.parse(dockerMetaJson);
if (Array.isArray(parsed.tags) && parsed.tags.length > 0) {
return parsed.tags;
}
}
} catch (e) {
// ignore parsing error, fallback below
}
if (image && version) {
return [`${image}:${version}`];
}
return [];
};
const generateCommentBody = () => {
const tags = parseTags();
const buildTime = new Date().toISOString();
// Use the first tag as the main version
const mainTag = tags.length > 0 ? tags[0] : `${image}:${version}`;
const tagVersion = mainTag.includes(':') ? mainTag.split(':')[1] : version;
return [
COMMENT_IDENTIFIER,
'',
'### 🐳 Docker Build Completed!',
`**Version**: \`${tagVersion || 'N/A'}\``,
`**Build Time**: \`${buildTime}\``,
'',
dockerhubUrl ? `🔗 View all tags on Docker Hub: ${dockerhubUrl}` : '',
'',
'### Pull Image',
'Download the Docker image to your local machine:',
'',
'```bash',
`docker pull ${mainTag}`,
'```',
'',
'### Run Eval',
'Execute evaluation tasks using the built image:',
'',
'```bash',
`DOCKER_IMAGE_VERSION=${tagVersion} ./run-task.sh --models gpt-4.1-mini --tasks file_context/uppercase`,
'```',
'',
'> [!IMPORTANT]',
'> This build is for testing and validation purposes.',
]
.filter(Boolean)
.join('\n');
};
const body = generateCommentBody();
// List comments on the PR
const { data: comments } = await github.rest.issues.listComments({
issue_number: context.issue.number,
owner: context.repo.owner,
repo: context.repo.repo,
});
const existing = comments.find((c) => c.body && c.body.includes(COMMENT_IDENTIFIER));
if (existing) {
await github.rest.issues.updateComment({
comment_id: existing.id,
owner: context.repo.owner,
repo: context.repo.repo,
body,
});
return { updated: true, id: existing.id };
}
const result = await github.rest.issues.createComment({
issue_number: context.issue.number,
owner: context.repo.owner,
repo: context.repo.repo,
body,
});
return { updated: false, id: result.data.id };
};
================================================
FILE: .github/workflows/publish-docker-image.yml
================================================
name: Publish Docker Image
on:
workflow_dispatch:
release:
types: [ published ]
pull_request:
types: [ synchronize, labeled, unlabeled ]
permissions:
contents: read
pull-requests: write
concurrency:
group: ${{ github.ref }}-${{ github.workflow }}
cancel-in-progress: true
env:
REGISTRY_IMAGE: evalsysorg/mcpmark
PR_TAG_PREFIX: pr-
jobs:
build:
if: |
(github.event_name == 'pull_request' &&
contains(github.event.pull_request.labels.*.name, 'Build Docker')) ||
github.event_name != 'pull_request'
strategy:
matrix:
include:
- platform: linux/amd64
os: ubuntu-latest
- platform: linux/arm64
os: ubuntu-24.04-arm
runs-on: ${{ matrix.os }}
name: Build ${{ matrix.platform }} Image
steps:
- name: Prepare
run: |
platform=${{ matrix.platform }}
echo "PLATFORM_PAIR=${platform//\//-}" >> $GITHUB_ENV
- name: Checkout base
uses: actions/checkout@v4
with:
fetch-depth: 0
- name: Set up Docker Buildx
uses: docker/setup-buildx-action@v3
- name: Generate PR metadata
if: github.event_name == 'pull_request'
id: pr_meta
run: |
branch_name="${{ github.head_ref }}"
sanitized_branch=$(echo "${branch_name}" | sed -E 's/[^a-zA-Z0-9_.-]+/-/g')
echo "pr_tag=${sanitized_branch}-$(git rev-parse --short HEAD)" >> $GITHUB_OUTPUT
- name: Docker meta
id: meta
uses: docker/metadata-action@v5
with:
images: ${{ env.REGISTRY_IMAGE }}
tags: |
type=raw,value=${{ env.PR_TAG_PREFIX }}${{ steps.pr_meta.outputs.pr_tag }},enable=${{ github.event_name == 'pull_request' }}
type=semver,pattern={{version}},enable=${{ github.event_name != 'pull_request' }}
type=raw,value=latest,enable=${{ github.event_name != 'pull_request' }}
- name: Docker login
uses: docker/login-action@v3
with:
username: ${{ secrets.DOCKER_REGISTRY_USER }}
password: ${{ secrets.DOCKER_REGISTRY_PASSWORD }}
- name: Get commit SHA
if: github.ref == 'refs/heads/main'
id: vars
run: echo "sha_short=$(git rev-parse --short HEAD)" >> $GITHUB_OUTPUT
- name: Build and export
id: build
uses: docker/build-push-action@v6
with:
platforms: ${{ matrix.platform }}
context: .
file: ./Dockerfile
labels: ${{ steps.meta.outputs.labels }}
build-args: |
SHA=${{ steps.vars.outputs.sha_short }}
outputs: type=image,name=${{ env.REGISTRY_IMAGE }},push-by-digest=true,name-canonical=true,push=true
- name: Export digest
run: |
rm -rf /tmp/digests
mkdir -p /tmp/digests
digest="${{ steps.build.outputs.digest }}"
touch "/tmp/digests/${digest#sha256:}"
- name: Upload artifact
uses: actions/upload-artifact@v4
with:
name: digest-${{ env.PLATFORM_PAIR }}
path: /tmp/digests/*
if-no-files-found: error
retention-days: 1
merge:
name: Merge
needs: build
runs-on: ubuntu-latest
steps:
- name: Checkout base
uses: actions/checkout@v4
with:
fetch-depth: 0
- name: Download digests
uses: actions/download-artifact@v5
with:
path: /tmp/digests
pattern: digest-*
merge-multiple: true
- name: Set up Docker Buildx
uses: docker/setup-buildx-action@v3
- name: Generate PR metadata
if: github.event_name == 'pull_request'
id: pr_meta
run: |
branch_name="${{ github.head_ref }}"
sanitized_branch=$(echo "${branch_name}" | sed -E 's/[^a-zA-Z0-9_.-]+/-/g')
echo "pr_tag=${sanitized_branch}-$(git rev-parse --short HEAD)" >> $GITHUB_OUTPUT
- name: Docker meta
id: meta
uses: docker/metadata-action@v5
with:
images: ${{ env.REGISTRY_IMAGE }}
tags: |
type=raw,value=${{ env.PR_TAG_PREFIX }}${{ steps.pr_meta.outputs.pr_tag }},enable=${{ github.event_name == 'pull_request' }}
type=semver,pattern={{version}},enable=${{ github.event_name != 'pull_request' }}
type=raw,value=latest,enable=${{ github.event_name != 'pull_request' }}
- name: Docker login
uses: docker/login-action@v3
with:
username: ${{ secrets.DOCKER_REGISTRY_USER }}
password: ${{ secrets.DOCKER_REGISTRY_PASSWORD }}
- name: Create manifest list and push
working-directory: /tmp/digests
run: |
docker buildx imagetools create $(jq -cr '.tags | map("-t " + .) | join(" ")' <<< "$DOCKER_METADATA_OUTPUT_JSON") \
$(printf '${{ env.REGISTRY_IMAGE }}@sha256:%s ' *)
- name: Inspect image
run: |
docker buildx imagetools inspect ${{ env.REGISTRY_IMAGE }}:${{ steps.meta.outputs.version }}
- name: Comment on PR with Docker build info
if: github.event_name == 'pull_request'
uses: actions/github-script@v7
with:
github-token: ${{ secrets.GITHUB_TOKEN }}
script: |
const prComment = require('${{ github.workspace }}/.github/scripts/pr-comment.js');
const result = await prComment({
github,
context,
dockerMetaJson: ${{ toJSON(steps.meta.outputs.json) }},
image: "${{ env.REGISTRY_IMAGE }}",
version: "${{ steps.meta.outputs.version }}",
dockerhubUrl: "https://hub.docker.com/r/${{ env.REGISTRY_IMAGE }}/tags",
platforms: "linux/amd64, linux/arm64",
});
core.info(`Status: ${result.updated ? 'Updated' : 'Created'}, ID: ${result.id}`);
================================================
FILE: .gitignore
================================================
logs
.claude
CLAUDE.md
.gemini
results
materials
scripts
!.github/scripts
.nfs*
.mcp_env
.idea
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[codz]
*$py.class
logs
logs/*
.DS_Store
notion-sdk-py/
github_state/*
# for playwright cookies
notion_state.json
# C extensions
*.so
# Distribution / packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
share/python-wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST
# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec
# Installer logs
pip-log.txt
pip-delete-this-directory.txt
# Unit test / coverage reports
htmlcov/
.tox/
.nox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
*.py.cover
.hypothesis/
.pytest_cache/
cover/
# Translations
*.mo
*.pot
# Django stuff:
*.log
local_settings.py
db.sqlite3
db.sqlite3-journal
# Flask stuff:
instance/
.webassets-cache
# Scrapy stuff:
.scrapy
# Sphinx documentation
docs/_build/
# PyBuilder
.pybuilder/
target/
# Jupyter Notebook
.ipynb_checkpoints
# IPython
profile_default/
ipython_config.py
# pyenv
# For a library or package, you might want to ignore these files since the code is
# intended to run in multiple environments; otherwise, check them in:
# .python-version
# pipenv
# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
# However, in case of collaboration, if having platform-specific dependencies or dependencies
# having no cross-platform support, pipenv may install dependencies that don't work, or not
# install all needed dependencies.
#Pipfile.lock
# UV
# Similar to Pipfile.lock, it is generally recommended to include uv.lock in version control.
# This is especially recommended for binary packages to ensure reproducibility, and is more
# commonly ignored for libraries.
#uv.lock
# poetry
# Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
# This is especially recommended for binary packages to ensure reproducibility, and is more
# commonly ignored for libraries.
# https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
#poetry.lock
#poetry.toml
# pdm
# Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
# pdm recommends including project-wide configuration in pdm.toml, but excluding .pdm-python.
# https://pdm-project.org/en/latest/usage/project/#working-with-version-control
#pdm.lock
#pdm.toml
.pdm-python
.pdm-build/
# pixi
# Similar to Pipfile.lock, it is generally recommended to include pixi.lock in version control.
#pixi.lock
# Pixi creates a virtual environment in the .pixi directory, just like venv module creates one
# in the .venv directory. It is recommended not to include this directory in version control.
.pixi
# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
__pypackages__/
# Celery stuff
celerybeat-schedule
celerybeat.pid
# SageMath parsed files
*.sage.py
# Environments
.env
.envrc
.venv
env/
venv/
ENV/
env.bak/
venv.bak/
# Spyder project settings
.spyderproject
.spyproject
# Rope project settings
.ropeproject
# mkdocs documentation
/site
# mypy
.mypy_cache/
.dmypy.json
dmypy.json
# Pyre type checker
.pyre/
# pytype static type analyzer
.pytype/
# Cython debug symbols
cython_debug/
# PyCharm
# JetBrains specific template is maintained in a separate JetBrains.gitignore that can
# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
# and can be added to the global gitignore or merged into this file. For a more nuclear
# option (not recommended) you can uncomment the following to ignore the entire idea folder.
#.idea/
# Abstra
# Abstra is an AI-powered process automation framework.
# Ignore directories containing user credentials, local state, and settings.
# Learn more at https://abstra.io/docs
.abstra/
# Visual Studio Code
# Visual Studio Code specific template is maintained in a separate VisualStudioCode.gitignore
# that can be found at https://github.com/github/gitignore/blob/main/Global/VisualStudioCode.gitignore
# and can be added to the global gitignore or merged into this file. However, if you prefer,
# you could uncomment the following to ignore the entire vscode folder
# .vscode/
# Ruff stuff:
.ruff_cache/
# PyPI configuration file
.pypirc
# Cursor
# Cursor is an AI-powered code editor. `.cursorignore` specifies files/directories to
# exclude from AI features like autocomplete and code analysis. Recommended for sensitive data
# refer to https://docs.cursor.com/context/ignore-files
.cursorignore
.cursorindexingignore
# Marimo
marimo/_static/
marimo/_lsp/
__marimo__/
# pixi environments
.pixi
*.egg-info
.postgres
# MCPMark backup directories
.mcpmark_backups/*
test_environments/
postgres_state
================================================
FILE: CHANGELOG.md
================================================
# Changelog
All notable changes to this project will be documented in this file.
The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
## v1.2.0 - 2025-09-20
This version includes multiple important feature enhancements, particularly improvements in cost calculation, error handling, and Notion integration. Added per-model cost calculation, comprehensive aggregator functionality, and more robust error recovery mechanisms.
### ✨ Features
- **Add 1m parameter & improve log** (#198) - Added claude-1m-context option and enhanced logging functionality
- **Refine Notion parent resolution and duplicate recovery** (#197) - Improved Notion parent page resolution and duplicate content recovery mechanism
- **Comprehensive aggregator, enable push to new branch** (#185) - Implemented comprehensive aggregator functionality with support for pushing to new branches
- **Support price cost calculating per model** (#186) - Added per-model price cost calculation functionality
- **Improve agent end log** (#183) - Enhanced agent end logging
- **Improve litellm error handling** (#181) - Enhanced LiteLLM error handling mechanism
### ♻️ Refactoring
- **Use notion child block list to locate page** (#196) - Refactored page location logic to use Notion child block list approach
### 🐛 Bug Fixes
- **Fix verification in Notion task company_in_a_box/goals_restructure** (#194) - Fixed verification logic for specific Notion tasks
- **Improve claude error handling** (#195) - Improved error handling for Claude API interactions
- **Fix trailing slash issue for find_legacy_name** - Resolved trailing slash issues in find_legacy_name path handling
- **Recover when duplication lands on parent** (#189) - Fixed recovery mechanism when duplicate content affects parent pages
- **Correctly handle playwright parser** (#184) - Properly handle Playwright parser
- **Handle timeout error, add timeout error for resuming** (#182) - Handle timeout errors and add timeout error handling for resume operations
### 📝 Documentation
- **Better readme, notion language guide** (#190) - Improved README documentation and added comprehensive Notion language guide
### 🔨 Maintenance
- **Update price info** (#188) - Updated pricing information
- **Update desktop_template/file_arrangement/verify.py** (#187) - Maintenance updates to verification scripts
================================================
FILE: Dockerfile
================================================
# MCPMark Docker image with optimized layer caching
# Stage 1: Builder for Python dependencies only
FROM python:3.12-slim AS builder
RUN apt-get update && apt-get install -y --no-install-recommends \
gcc \
g++ \
libpq-dev \
&& rm -rf /var/lib/apt/lists/*
WORKDIR /build
# Copy project files needed for pip install
COPY pyproject.toml ./
COPY src/ ./src/
COPY tasks/ ./tasks/
# Install dependencies
RUN pip install --no-cache-dir --user .
# Stage 2: Final image with all runtime dependencies
FROM python:3.12-slim
# Layer 1: Core system dependencies (very stable, rarely changes)
RUN apt-get update && apt-get install -y --no-install-recommends \
ca-certificates \
&& rm -rf /var/lib/apt/lists/*
# Layer 2: PostgreSQL runtime and client tools (stable, only changes with postgres version)
RUN apt-get update && apt-get install -y --no-install-recommends \
libpq5 \
postgresql-client \
&& rm -rf /var/lib/apt/lists/*
# Layer 3: Git (stable)
RUN apt-get update && apt-get install -y --no-install-recommends \
git \
&& rm -rf /var/lib/apt/lists/*
# Layer 4: Playwright system dependencies (changes with browser requirements)
RUN apt-get update && apt-get install -y --no-install-recommends \
libnss3 \
libnspr4 \
libatk1.0-0 \
libatk-bridge2.0-0 \
libcups2 \
libdrm2 \
libxkbcommon0 \
libatspi2.0-0 \
libx11-6 \
libxcomposite1 \
libxdamage1 \
libxfixes3 \
libxrandr2 \
libgbm1 \
libxcb1 \
libpango-1.0-0 \
libcairo2 \
libasound2 \
&& rm -rf /var/lib/apt/lists/*
# Layer 5: Download tools and Node.js (changes with Node version)
RUN apt-get update && \
apt-get install -y --no-install-recommends curl wget unzip && \
curl -fsSL https://deb.nodesource.com/setup_20.x | bash - && \
apt-get install -y --no-install-recommends nodejs && \
apt-get autoremove -y && \
rm -rf /var/lib/apt/lists/*
# Layer 6: pipx (rarely changes)
RUN pip install --no-cache-dir pipx && \
pipx ensurepath
# Layer 7: Copy Python packages from builder (changes with dependencies)
COPY --from=builder /root/.local /root/.local
# Layer 8: Playwright browsers (changes with browser versions)
RUN python3 -m playwright install chromium && \
npx -y playwright install chromium
# Layer 9: Install PostgreSQL MCP server (Python, used via `pipx run postgres-mcp`)
RUN pipx install postgres-mcp
# Set working directory
WORKDIR /app
# Layer 10: Create directory structure (rarely changes)
RUN mkdir -p /app/results
# Layer 11: Application code (changes frequently)
COPY . .
# Set environment
ENV PATH="/root/.local/bin:/root/.local/pipx/venvs/*/bin:${PATH}"
ENV PYTHONPATH="/app"
ENV PLAYWRIGHT_BROWSERS_PATH=/root/.cache/ms-playwright
ENV PIPX_HOME=/root/.local/pipx
ENV PIPX_BIN_DIR=/root/.local/bin
# Default command
CMD ["python3", "-m", "pipeline", "--help"]
================================================
FILE: LICENSE
================================================
Apache License
Version 2.0, January 2004
http://www.apache.org/licenses/
TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
1. Definitions.
"License" shall mean the terms and conditions for use, reproduction,
and distribution as defined by Sections 1 through 9 of this document.
"Licensor" shall mean the copyright owner or entity authorized by
the copyright owner that is granting the License.
"Legal Entity" shall mean the union of the acting entity and all
other entities that control, are controlled by, or are under common
control with that entity. For the purposes of this definition,
"control" means (i) the power, direct or indirect, to cause the
direction or management of such entity, whether by contract or
otherwise, or (ii) ownership of fifty percent (50%) or more of the
outstanding shares, or (iii) beneficial ownership of such entity.
"You" (or "Your") shall mean an individual or Legal Entity
exercising permissions granted by this License.
"Source" form shall mean the preferred form for making modifications,
including but not limited to software source code, documentation
source, and configuration files.
"Object" form shall mean any form resulting from mechanical
transformation or translation of a Source form, including but
not limited to compiled object code, generated documentation,
and conversions to other media types.
"Work" shall mean the work of authorship, whether in Source or
Object form, made available under the License, as indicated by a
copyright notice that is included in or attached to the work
(an example is provided in the Appendix below).
"Derivative Works" shall mean any work, whether in Source or Object
form, that is based on (or derived from) the Work and for which the
editorial revisions, annotations, elaborations, or other modifications
represent, as a whole, an original work of authorship. For the purposes
of this License, Derivative Works shall not include works that remain
separable from, or merely link (or bind by name) to the interfaces of,
the Work and Derivative Works thereof.
"Contribution" shall mean any work of authorship, including
the original version of the Work and any modifications or additions
to that Work or Derivative Works thereof, that is intentionally
submitted to Licensor for inclusion in the Work by the copyright owner
or by an individual or Legal Entity authorized to submit on behalf of
the copyright owner. For the purposes of this definition, "submitted"
means any form of electronic, verbal, or written communication sent
to the Licensor or its representatives, including but not limited to
communication on electronic mailing lists, source code control systems,
and issue tracking systems that are managed by, or on behalf of, the
Licensor for the purpose of discussing and improving the Work, but
excluding communication that is conspicuously marked or otherwise
designated in writing by the copyright owner as "Not a Contribution."
"Contributor" shall mean Licensor and any individual or Legal Entity
on behalf of whom a Contribution has been received by Licensor and
subsequently incorporated within the Work.
2. Grant of Copyright License. Subject to the terms and conditions of
this License, each Contributor hereby grants to You a perpetual,
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
copyright license to reproduce, prepare Derivative Works of,
publicly display, publicly perform, sublicense, and distribute the
Work and such Derivative Works in Source or Object form.
3. Grant of Patent License. Subject to the terms and conditions of
this License, each Contributor hereby grants to You a perpetual,
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
(except as stated in this section) patent license to make, have made,
use, offer to sell, sell, import, and otherwise transfer the Work,
where such license applies only to those patent claims licensable
by such Contributor that are necessarily infringed by their
Contribution(s) alone or by combination of their Contribution(s)
with the Work to which such Contribution(s) was submitted. If You
institute patent litigation against any entity (including a
cross-claim or counterclaim in a lawsuit) alleging that the Work
or a Contribution incorporated within the Work constitutes direct
or contributory patent infringement, then any patent licenses
granted to You under this License for that Work shall terminate
as of the date such litigation is filed.
4. Redistribution. You may reproduce and distribute copies of the
Work or Derivative Works thereof in any medium, with or without
modifications, and in Source or Object form, provided that You
meet the following conditions:
(a) You must give any other recipients of the Work or
Derivative Works a copy of this License; and
(b) You must cause any modified files to carry prominent notices
stating that You changed the files; and
(c) You must retain, in the Source form of any Derivative Works
that You distribute, all copyright, patent, trademark, and
attribution notices from the Source form of the Work,
excluding those notices that do not pertain to any part of
the Derivative Works; and
(d) If the Work includes a "NOTICE" text file as part of its
distribution, then any Derivative Works that You distribute must
include a readable copy of the attribution notices contained
within such NOTICE file, excluding those notices that do not
pertain to any part of the Derivative Works, in at least one
of the following places: within a NOTICE text file distributed
as part of the Derivative Works; within the Source form or
documentation, if provided along with the Derivative Works; or,
within a display generated by the Derivative Works, if and
wherever such third-party notices normally appear. The contents
of the NOTICE file are for informational purposes only and
do not modify the License. You may add Your own attribution
notices within Derivative Works that You distribute, alongside
or as an addendum to the NOTICE text from the Work, provided
that such additional attribution notices cannot be construed
as modifying the License.
You may add Your own copyright statement to Your modifications and
may provide additional or different license terms and conditions
for use, reproduction, or distribution of Your modifications, or
for any such Derivative Works as a whole, provided Your use,
reproduction, and distribution of the Work otherwise complies with
the conditions stated in this License.
5. Submission of Contributions. Unless You explicitly state otherwise,
any Contribution intentionally submitted for inclusion in the Work
by You to the Licensor shall be under the terms and conditions of
this License, without any additional terms or conditions.
Notwithstanding the above, nothing herein shall supersede or modify
the terms of any separate license agreement you may have executed
with Licensor regarding such Contributions.
6. Trademarks. This License does not grant permission to use the trade
names, trademarks, service marks, or product names of the Licensor,
except as required for reasonable and customary use in describing the
origin of the Work and reproducing the content of the NOTICE file.
7. Disclaimer of Warranty. Unless required by applicable law or
agreed to in writing, Licensor provides the Work (and each
Contributor provides its Contributions) on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
implied, including, without limitation, any warranties or conditions
of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
PARTICULAR PURPOSE. You are solely responsible for determining the
appropriateness of using or redistributing the Work and assume any
risks associated with Your exercise of permissions under this License.
8. Limitation of Liability. In no event and under no legal theory,
whether in tort (including negligence), contract, or otherwise,
unless required by applicable law (such as deliberate and grossly
negligent acts) or agreed to in writing, shall any Contributor be
liable to You for damages, including any direct, indirect, special,
incidental, or consequential damages of any character arising as a
result of this License or out of the use or inability to use the
Work (including but not limited to damages for loss of goodwill,
work stoppage, computer failure or malfunction, or any and all
other commercial damages or losses), even if such Contributor
has been advised of the possibility of such damages.
9. Accepting Warranty or Additional Liability. While redistributing
the Work or Derivative Works thereof, You may choose to offer,
and charge a fee for, acceptance of support, warranty, indemnity,
or other liability obligations and/or rights consistent with this
License. However, in accepting such obligations, You may act only
on Your own behalf and on Your sole responsibility, not on behalf
of any other Contributor, and only if You agree to indemnify,
defend, and hold each Contributor harmless for any liability
incurred by, or claims asserted against, such Contributor by reason
of your accepting any such warranty or additional liability.
END OF TERMS AND CONDITIONS
APPENDIX: How to apply the Apache License to your work.
To apply the Apache License to your work, attach the following
boilerplate notice, with the fields enclosed by brackets "[]"
replaced with your own identifying information. (Don't include
the brackets!) The text should be enclosed in the appropriate
comment syntax for the file format. We also recommend that a
file or class name and description of purpose be included on the
same "printed page" as the copyright notice for easier
identification within third-party archives.
Copyright [yyyy] [name of copyright owner]
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
================================================
FILE: README.md
================================================
# MCPMark: Stress-Testing Comprehensive MCP Use
[Website](https://mcpmark.ai) · [Paper](https://arxiv.org/abs/2509.24002) · [Discord](https://discord.gg/HrKkJAxDnA) · [Docs](https://mcpmark.ai/docs) · [Trajectory Dataset](https://huggingface.co/datasets/Jakumetsu/mcpmark-trajectory-log)
An evaluation suite for agentic models in real MCP tool environments (Notion / GitHub / Filesystem / Postgres / Playwright).
MCPMark provides a reproducible, extensible benchmark for researchers and engineers: one-command tasks, isolated sandboxes, auto-resume for failures, unified metrics, and aggregated reports.
## News
- 📌 **21 Jan** — Pinned MCP server versions for reproducible benchmarks: GitHub MCP Server `v0.15.0` (switched to Docker for version control), Notion MCP Server `@1.9.1` (Notion released 2.0 but it has many bugs, not recommended). See [#246](https://github.com/eval-sys/mcpmark/pull/246).
- 🔥 **13 Dec** — Added auto-compaction support (`--compaction-token`) to summarize long conversations and avoid context overflow during evaluation ([#236](https://github.com/eval-sys/mcpmark/pull/236)).
- 🏅 **02 Dec** — Evaluated `gemini-3-pro-preview` (thinking: low): **Pass@1 50.6%** ± 2.3% — so close to `gpt-5-high` (51.6%)! Also `deepseek-v3.2-thinking` 36.8% and `deepseek-v3.2-chat` 29.7%
- 🔥 **02 Dec** — Obfuscate GitHub @mentions to prevent notification spam during evaluation ([#229](https://github.com/eval-sys/mcpmark/pull/229))
- 🏅 **01 Dec** — DeepSeek v3.2 uses MCPMark! Kudos on securing the best open-source model. [X Post](https://x.com/deepseek_ai/status/1995452650557763728) | [Technical Report](https://huggingface.co/deepseek-ai/DeepSeek-V3.2/resolve/main/assets/paper.pdf)
- 🔥 **17 Nov** — Added 50 easy tasks (10 per MCP server) for smaller open-source models ([#225](https://github.com/eval-sys/mcpmark/pull/225))
- 🤝 **31 Oct** — Community PR from insforge: better MCP servers achieve better results with fewer tokens! ([#214](https://github.com/eval-sys/mcpmark/pull/214))
- 🔥 **13 Oct** — Added ReAct agent support. PRs for new agent scaffolds welcome! ([#209](https://github.com/eval-sys/mcpmark/pull/209))
- 🏅 **10 Sep** — `qwen-3-coder-plus` is the best open-source model! Kudos to Qwen team. [X Post](https://x.com/Alibaba_Qwen/status/1965457023438651532)
---
## What you can do with MCPMark
- **Evaluate real tool usage** across multiple MCP services: `Notion`, `GitHub`, `Filesystem`, `Postgres`, `Playwright`.
- **Use ready-to-run tasks** covering practical workflows, each with strict automated verification.
- **Reliable and reproducible**: isolated environments that do not pollute your accounts/data; failed tasks auto-retry and resume.
- **Unified metrics and aggregation**: single/multi-run (pass@k, avg@k, etc.) with automated results aggregation.
- **Flexible deployment**: local or Docker; fully validated on macOS and Linux.
---
## Quickstart (5 minutes)
### 1) Clone the repository
```bash
git clone https://github.com/eval-sys/mcpmark.git
cd mcpmark
```
### 2) Set environment variables (create `.mcp_env` at repo root)
Only set what you need. Add service credentials when running tasks for that service.
```env
# Example: OpenAI
OPENAI_BASE_URL="https://api.openai.com/v1"
OPENAI_API_KEY="sk-..."
# Optional: Notion (only for Notion tasks)
SOURCE_NOTION_API_KEY="your-source-notion-api-key"
EVAL_NOTION_API_KEY="your-eval-notion-api-key"
EVAL_PARENT_PAGE_TITLE="MCPMark Eval Hub"
PLAYWRIGHT_BROWSER="chromium" # chromium | firefox
PLAYWRIGHT_HEADLESS="True"
# Optional: GitHub (only for GitHub tasks)
GITHUB_TOKENS="token1,token2" # token pooling for rate limits
GITHUB_EVAL_ORG="your-eval-org"
# Optional: Postgres (only for Postgres tasks)
POSTGRES_HOST="localhost"
POSTGRES_PORT="5432"
POSTGRES_USERNAME="postgres"
POSTGRES_PASSWORD="password"
```
See `docs/introduction.md` and the service guides below for more details.
### 3) Install and run a minimal example
Local (Recommended)
```bash
pip install -e .
# If you'll use browser-based tasks, install Playwright browsers first
playwright install
```
MCPMark defaults to the built-in orchestration agent (`MCPMarkAgent`). To experiment with the ReAct-style agent, pass `--agent react` to `pipeline.py` (other settings stay the same).
Docker
```bash
./build-docker.sh
```
Run a filesystem task (no external accounts required):
```bash
# Run once for a quick start; use gpt-5 or any model you configured
python -m pipeline \
  --mcp filesystem \
  --k 1 \
  --models gpt-5 \
  --tasks file_property/size_classification
# Add --task-suite easy to run the lightweight dataset (where available)
```
Results are saved to `./results/{exp_name}/{model}__{mcp}/run-*/...` for the standard suite and `./results/{exp_name}/{model}__{mcp}-easy/run-*/...` when you run `--task-suite easy` (e.g., `./results/test-run/gpt-5__filesystem/run-1/...` or `./results/test-run/gpt-5__github-easy/run-1/...`).
---
## Run your evaluations
### Task suites (standard vs easy)
- Each MCP service now stores tasks under `tasks/<mcp_service>/<task_suite>/<category_id>/<task_id>/`.
- `standard` (default) covers the full benchmark (127 tasks today).
- `easy` hosts 10 lightweight tasks per MCP, ideal for smoke tests and CI (GitHub’s are already available under `tasks/github/easy`).
- Switch suites with `--task-suite easy` (defaults to `--task-suite standard`).
### Single run (k=1)
```bash
# Run ALL tasks for a service
python -m pipeline --exp-name exp --mcp notion --tasks all --models MODEL --k 1
# Run a task group
python -m pipeline --exp-name exp --mcp notion --tasks online_resume --models MODEL --k 1
# Run a specific task
python -m pipeline --exp-name exp --mcp notion --tasks online_resume/daily_itinerary_overview --models MODEL --k 1
# Evaluate multiple models
python -m pipeline --exp-name exp --mcp notion --tasks all --models MODEL1,MODEL2,MODEL3 --k 1
```
### Multiple runs (k>1) for pass@k
```bash
# Run k=4 (the default) to compute stability metrics (requires --exp-name to aggregate final results)
python -m pipeline --exp-name exp --mcp notion --tasks all --models MODEL --k 4
# Aggregate results (pass@1 / pass@k / pass^k / avg@k)
python -m src.aggregators.aggregate_results --exp-name exp
```
### Run with Docker
```bash
# Run all tasks for a service
./run-task.sh --mcp notion --models MODEL --exp-name exp --tasks all
# Cross-service benchmark
./run-benchmark.sh --models MODEL --exp-name exp --docker
```
Please visit `docs/introduction.md` for choices of *MODEL*.
Tip: MCPMark supports **auto-resume**. When re-running, only unfinished tasks will execute. Failures matching our retryable patterns (see [RETRYABLE_PATTERNS](src/errors.py)) are retried automatically. Models may emit different error strings—if you encounter a new resumable error, please open a PR or issue.
Tip: MCPMark supports **auto-compaction**; pass `--compaction-token N` to enable automatic context summarization when prompt tokens reach `N` (use `999999999` to disable).
---
## Service setup and authentication
| Service | Setup summary | Docs |
|-------------|-----------------------------------------------------------------------------------------------------------------|---------------------------------------|
| Notion | Environment isolation (Source Hub / Eval Hub), integration creation and grants, browser login verification. | [Guide](docs/mcp/notion.md) |
| GitHub | Multi-account token pooling recommended; import pre-exported repo state if needed. | [Guide](docs/mcp/github.md) |
| Postgres | Start via Docker and import sample databases. | [Setup](docs/mcp/postgres.md) |
| Playwright | Install browsers before first run; defaults to `chromium`. | [Setup](docs/mcp/playwright.md) |
| Filesystem | Zero-configuration, run directly. | [Config](docs/mcp/filesystem.md) |
You can also follow [Quickstart](docs/quickstart.md) for the shortest end-to-end path.
### Important Notice: GitHub Repository Privacy
> **Please ensure your evaluation repositories are set to PRIVATE.**
GitHub state templates are now automatically downloaded from our CDN during evaluation — no manual download is required. However, because these templates contain issues and pull requests from real open-source repositories, the recreation process includes `@username` mentions of the original authors.
**We have received feedback from original GitHub authors who were inadvertently notified** when evaluation repositories were created as public. To be a responsible member of the open-source community, we urge all users to:
1. **Always keep evaluation repositories private** during the evaluation process.
2. **In the latest version**, we have added random suffixes to all `@username` mentions (e.g., `@user` becomes `@user_x7k2`) and implemented a safety check that prevents importing templates to public repositories.
3. **If you are using an older version of MCPMark**, please either:
- Pull the latest code immediately, or
- Manually ensure all GitHub evaluation repositories are set to private.
Thank you for helping us maintain a respectful relationship with the open-source community.
---
## Results and metrics
- Results are organized under `./results/{exp_name}/{model}__{mcp}/run-*/` (JSON + CSV per task).
- Generate a summary with:
```bash
# Basic usage
python -m src.aggregators.aggregate_results --exp-name exp
# For k-run experiments with single-run models
python -m src.aggregators.aggregate_results --exp-name exp --k 4 --single-run-models claude-opus-4-1
```
- Only models with complete results across all tasks and runs are included in the final summary.
- Includes multi-run metrics (pass@k, pass^k) for stability comparisons when k > 1.
---
## Model and Tasks
- **Model support**: MCPMark calls models via LiteLLM — see the LiteLLM docs: [`LiteLLM Doc`](https://docs.litellm.ai/docs/). For Anthropic (Claude) extended thinking mode (enabled via `--reasoning-effort`), we use Anthropic’s native API.
- See `docs/introduction.md` for details and configuration of supported models in MCPMark.
- To add a new model, edit `src/model_config.py`. Before adding, check LiteLLM supported models/providers. See [`LiteLLM Doc`](https://docs.litellm.ai/docs/).
- Task design principles are described in `docs/datasets/task.md`. Each task ships with an automated `verify.py` for objective, reproducible evaluation.
---
## Contributing
Contributions are welcome:
1. Add a new task under `tasks/<mcp_service>/<task_suite>/<category_id>/<task_id>/` with `meta.json`, `description.md` and `verify.py`.
2. Ensure local checks pass and open a PR.
3. See `docs/contributing/make-contribution.md`.
---
## Citation
If you find our work useful for your research, please consider citing:
```bibtex
@misc{wu2025mcpmark,
title={MCPMark: A Benchmark for Stress-Testing Realistic and Comprehensive MCP Use},
author={Zijian Wu and Xiangyan Liu and Xinyuan Zhang and Lingjun Chen and Fanqing Meng and Lingxiao Du and Yiran Zhao and Fanshi Zhang and Yaoqi Ye and Jiawei Wang and Zirui Wang and Jinjie Ni and Yufan Yang and Arvin Xu and Michael Qizhe Shieh},
year={2025},
eprint={2509.24002},
archivePrefix={arXiv},
primaryClass={cs.CL},
url={https://arxiv.org/abs/2509.24002},
}
```
## License
This project is licensed under the Apache License 2.0 — see `LICENSE`.
================================================
FILE: build-docker.sh
================================================
#!/bin/bash
# Build Docker image for MCPMark
set -e
# Color codes for output
GREEN='\033[0;32m'
YELLOW='\033[1;33m'
NC='\033[0m' # No Color
echo -e "${YELLOW}Building MCPMark Docker image locally...${NC}"
# Build the Docker image with the same tag as Docker Hub for local testing
docker build -t evalsysorg/mcpmark:latest . "$@"
# Check if build was successful
if [ $? -eq 0 ]; then
echo -e "${GREEN}✓ Docker image built successfully${NC}"
echo " Tag: evalsysorg/mcpmark:latest"
# Show image info
echo ""
echo "Image details:"
docker images evalsysorg/mcpmark:latest --format "table {{.Repository}}\t{{.Tag}}\t{{.Size}}\t{{.CreatedAt}}"
echo ""
echo "You can now run tasks using:"
echo " ./run-task.sh --mcp notion --models o3 --exp-name test --tasks all"
else
echo "Docker build failed!"
exit 1
fi
================================================
FILE: cspell.config.yaml
================================================
version: "0.2"
ignorePaths: []
dictionaryDefinitions: []
dictionaries: []
words:
- datname
- domcontentloaded
- modelcontextprotocol
- pgdumplib
- pixi
- pypi
- topbar
- usename
ignoreWords: []
import: []
================================================
FILE: docs/contributing/make-contribution.md
================================================
# Contributing
1. Fork the repository and create a feature branch.
2. Add new tasks under `tasks/<mcp_service>/<task_suite>/<category_id>/<task_id>/` with `meta.json`, `description.md`, and `verify.py`. Please refer to the [Task Page](../datasets/task.md) for detailed instructions.
3. Ensure all tests pass.
4. Submit a pull request — contributions are welcome!
================================================
FILE: docs/datasets/task.md
================================================
# Task
The tasks in MCPMark follow two major principles:
- The tasks are based on realistic digital environments that are also used by human programmers.
- The task outcome can be robustly verified by Python scripts.
Therefore, each MCPMark task consists of three files:
- `meta.json`
- `description.md`
- `verify.py`
Here, `meta.json` includes the meta information of the task; `description.md` describes the purpose and setting of the task, as well as the instructions for completing it; and `verify.py` checks whether the task is completed successfully.
For example, you can ask the model agent to create a file with a specific name and write specific content to it, which belongs to the category of operating on file context. The structure looks like
```
tasks
│
└───filesystem
│
└───standard # task_suite (also supports `easy`)
│
└───file_context # category_id
│
└───create_file_write
│ meta.json
│ description.md
│ verify.py
```
All tasks live under `tasks/<mcp_service>/<task_suite>/<category_id>/<task_id>/`. `filesystem` refers to the MCP service and `task_suite` captures the difficulty slice (`standard` benchmark vs `easy` smoke tests).
`meta.json` includes the meta information about the task, with the following keys:
- task_id: the id of the task.
- task_name: full name of the task.
- description: task description.
- category_id: the id of task category.
- category_name: the full name of the task category.
- author: the author of the task.
- difficulty: the task difficulty level.
- created_at: the timestamp of task creation.
- tags: a list of tags that describe the task.
- mcp: a list of MCP services it belongs to.
- metadata: other meta information.
Here `category_name` describes the shared feature or environment across different tasks (e.g. the GitHub repository or Notion page the task is built on). In this running example, `category_name` refers to `file_context`. A quick way to sanity-check these keys is sketched below.
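For convenience, here is a minimal sketch (not a script shipped with MCPMark) that checks a task's `meta.json` against the keys listed above before you submit it; the task path used in the usage line is the running example and purely illustrative.
```python
import json
from pathlib import Path

# Keys described above; purely an illustrative pre-submission check.
REQUIRED_KEYS = {
    "task_id", "task_name", "description", "category_id", "category_name",
    "author", "difficulty", "created_at", "tags", "mcp", "metadata",
}

def check_meta(task_dir: str) -> None:
    """Raise if the task's meta.json is missing any of the documented keys."""
    meta = json.loads(Path(task_dir, "meta.json").read_text())
    missing = REQUIRED_KEYS - meta.keys()
    if missing:
        raise ValueError(f"meta.json is missing keys: {sorted(missing)}")
    print(f"{meta['task_id']}: meta.json looks complete")

# Hypothetical usage with the running example's directory.
check_meta("tasks/filesystem/standard/file_context/create_file_write")
```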
`description.md` could include the following information
- Task name
- Create and Write File.
- Task description
- Use the filesystem MCP tools to create a new file and write content to it.
- Task Objectives
- Create a new file named `hello_world.txt` in the test directory.
- Write the following content to the file: `Hello, World!`
- Verify the file was created successfully
- Verification Criteria
- File `hello_world.txt` exists in the test directory
- File contains the expected content structure
- File includes "Hello, World!" on the first line
- Tips
- Use the `write_file` tool to create and write content to the file
- The test directory path will be provided in the task context
The entire content of `description.md` will be read by the model agent for completing the task.
Accordingly, `verify.py` implements the following checks (a sketch follows this list):
- Check whether the target directory exists. [](https://postimg.cc/4nnLrw3M)
- Check whether the target directory contains the file with target file name. [](https://postimg.cc/7fGRTX87)
- Check whether the target file contains the desired content `EXPECTED_PATTERNS = ["Hello, World!"]`. [](https://postimg.cc/w7ZSWZc0)
- If the outcome passes **all the above verification functionalities**, the task would be marked as successfully completed.
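For the running example above, a `verify.py` might look roughly like the following. This is a sketch only: it assumes the test directory is passed as the first command-line argument, and real tasks should mirror the conventions of the existing `verify.py` files in the repository.
```python
#!/usr/bin/env python3
"""Illustrative verify.py sketch for the create_file_write example."""
import sys
from pathlib import Path

EXPECTED_PATTERNS = ["Hello, World!"]

def main() -> int:
    # Assumption: the test directory is passed as the first CLI argument.
    test_dir = Path(sys.argv[1]) if len(sys.argv) > 1 else Path(".")
    if not test_dir.is_dir():                             # 1. directory exists
        print(f"FAIL: directory {test_dir} not found")
        return 1
    target = test_dir / "hello_world.txt"
    if not target.is_file():                              # 2. file exists
        print(f"FAIL: {target} not found")
        return 1
    content = target.read_text()
    if not all(p in content for p in EXPECTED_PATTERNS):  # 3. content matches
        print("FAIL: expected content not found")
        return 1
    print("PASS: all checks succeeded")
    return 0

if __name__ == "__main__":
    sys.exit(main())
```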
================================================
FILE: docs/installation_and_docker_usage.md
================================================
# Installation and Docker Task Usage Guideline
## Overview
The MCPMark setup supports installation through either pip or MCPMark Docker (recommended) after cloning the code repository.
### Pip Installation
```bash
pip install -e .
```
The MCPMark Docker setup provides a simple way to run evaluation tasks in isolated containers. PostgreSQL is automatically handled when needed.
## 1. Quick Start
### 1.1 Docker Image
The official Docker image is automatically pulled from Docker Hub on first use.
The image is hosted at: https://hub.docker.com/r/evalsysorg/mcpmark
**Image Management:**
- The scripts automatically download the image when it's not found locally
- To manually update to the latest version:
```bash
docker pull evalsysorg/mcpmark:latest
```
- For local development/testing, you can build your own Docker image:
```bash
# Creates evalsysorg/mcpmark:latest locally
./build-docker.sh
```
## 2. Running MCP Experiments
### 2.1 Running Individual MCP Experiment
The `run-task.sh` script provides simplified Docker usage:
```bash
# Run filesystem tasks (filesystem is the default mcp service)
./run-task.sh --models MODEL_NAME --k K
# Run github/notion/postgres/playwright/playwright_webarena with specific task
./run-task.sh --mcp MCPSERVICE --models MODEL_NAME --exp-name EXPNAME --tasks TASK --k K
```
where *MODEL_NAME* refers to the model choice from the supported models (see [Introduction Page](./introduction.md) for more information), *EXPNAME* refers to the customized experiment name, *TASK* refers to a specific task or task group (see `tasks/<mcp_service>/<task_suite>/...` for more information), and *K* refers to the number of independent runs.
Additionally, the `run-benchmark.sh` script evaluates models across all MCP services:
```bash
# Run all services with Docker (recommended)
./run-benchmark.sh --models MODEL --exp-name EXPNAME --docker
# Run specific services
./run-benchmark.sh --models MODEL --exp-name EXPNAME --mcps MCPSERVICES --docker
# Run with parallel execution for faster results
./run-benchmark.sh --models MODEL --exp-name EXPNAME --docker --parallel
# Run locally without Docker
./run-benchmark.sh --models MODEL --exp-name EXPNAME --mcps MCPSERVICES
```
Here *MCPSERVICES* refers to a group of MCP services, separated by commas (e.g. *filesystem,postgres*).
The benchmark script:
- Runs all or selected MCP services automatically
- Supports progress tracking and timing
- Generates summary reports and logs
- Supports parallel service execution
- Continues running even if some services fail
- Automatically generates performance dashboards
### Manual Docker Commands
#### For Non-Postgres Services
Suppose Notion is the service:
```bash
# Build the image first
./build-docker.sh
# Run a task
docker run --rm \
-v $(pwd)/results:/app/results \
-v $(pwd)/.mcp_env:/app/.mcp_env:ro \
-v $(pwd)/notion_state.json:/app/notion_state.json:ro \
evalsysorg/mcpmark:latest \
python3 -m pipeline --mcp notion --models MODEL --exp-name EXPNAME --tasks TASK --k K
```
#### For Postgres Service
```bash
# The run-task.sh script handles postgres automatically, but if doing manually:
# Start postgres container
docker run -d \
--name mcp-postgres \
--network mcp-network \
-e POSTGRES_DATABASE=postgres \
-e POSTGRES_USER=postgres \
-e POSTGRES_PASSWORD=123456 \
ghcr.io/cloudnative-pg/postgresql:17-bookworm
# Run postgres task
docker run --rm \
--network mcp-network \
-e POSTGRES_HOST=mcp-postgres \
-v $(pwd)/results:/app/results \
-v $(pwd)/.mcp_env:/app/.mcp_env:ro \
evalsysorg/mcpmark:latest \
python3 -m pipeline --mcp postgres --models MODEL --exp-name EXPNAME --tasks TASK --k K
# Stop and remove postgres when done
docker stop mcp-postgres && docker rm mcp-postgres
```
## Script Usage
### Benchmark Runner (`run-benchmark.sh`)
```
./run-benchmark.sh --models MODELS --exp-name NAME [OPTIONS]
Required Options:
--models MODELS Comma-separated list of models to evaluate
--exp-name NAME Experiment name for organizing results
Optional Options:
--docker Run tasks in Docker containers (recommended)
--mcps SERVICES Comma-separated list of services to test
Default: filesystem,notion,github,postgres,playwright
--parallel Run services in parallel (experimental)
--timeout SECONDS Timeout per task in seconds (default: 3600)
```
### Individual Task Runner (`run-task.sh`)
```
./run-task.sh [--mcp SERVICE] [PIPELINE_ARGS]
Options:
--mcp SERVICE MCP service (notion|github|filesystem|playwright|postgres)
Default: filesystem
Environment Variables:
DOCKER_MEMORY_LIMIT Memory limit for container (default: 4g)
DOCKER_CPU_LIMIT CPU limit for container (default: 2)
DOCKER_IMAGE_VERSION Docker image tag to use (default: latest)
All other arguments are passed directly to the pipeline command.
Pipeline arguments (see python3 -m pipeline --help):
--mcp {notion,github,filesystem,playwright,postgres,playwright_webarena}
MCP service to use (default: filesystem)
--models MODELS Comma-separated list of models to evaluate (e.g., 'o3,k2,gpt-4.1')
--tasks TASKS Tasks to run: "all", a category name, or "category/task_name"
--exp-name EXP_NAME Experiment name; results are saved under results/<exp-name>/ (default: YYYY-MM-DD-HH-MM-SS)
--k K Number of evaluation runs for pass@k metrics (default: 4)
--timeout TIMEOUT Timeout in seconds for each task
--output-dir OUTPUT_DIR
Directory to save results
```
## Docker Benefits
1. **Efficiency**: Only starts necessary containers
2. **Isolation**: Each task runs in a fresh container
3. **Resource Management**: Automatic cleanup of containers and networks
4. **Smart Dependencies**: PostgreSQL only starts for postgres service
5. **Parallel Support**: Can run multiple services simultaneously for faster benchmarks
6. **Comprehensive Testing**: Benchmark script runs all services with one command
7. **Progress Tracking**: Colored output with timing and status information
8. **Automatic Reporting**: Generates summary reports and performance dashboards
## Common Troubleshooting
### Permission Issues
```bash
chmod +x run-task.sh
```
### Docker Build Issues
```bash
# Force rebuild with no cache
./run-task.sh --build --mcp MCPSERVICE --models MODEL_NAME --exp-name EXPNAME --tasks TASK
```
### PostgreSQL Connection Issues
```bash
# Check if postgres is running
docker ps | grep postgres
# View postgres logs
docker logs mcp-postgres-task
```
### Cleanup Stuck Resources
```bash
# Stop all containers
docker stop $(docker ps -q)
# Remove task network
docker network rm mcp-task-network
# Remove postgres data volume (careful!)
docker volume rm mcp-postgres-data
```
## Environment Variables
Create `.mcp_env` file with your credentials:
```env
# Service credentials
SOURCE_NOTION_API_KEY=your-key
EVAL_NOTION_API_KEY=your-key
GITHUB_TOKEN=your-token
POSTGRES_PASSWORD=your-password
# Model API keys
OPENAI_API_KEY=your-key
ANTHROPIC_API_KEY=your-key
# ... etc
```
Please refer to [Quick Start](./quickstart.md) for setting up API key for specific model.
## Docker Compose Files
- `docker-compose.yml` - Full stack with postgres (for development/testing)
## Notes
- Results are saved under `./results/<exp-name>/`.
- Each task runs in an ephemeral container.
- Docker image is shared across all tasks.
- PostgreSQL data persists in Docker volume.
================================================
FILE: docs/introduction.md
================================================
# MCPMark
MCPMark is a comprehensive suite for evaluating the agentic ability of frontier models.
MCPMark includes Model Context Protocol (MCP) services in the following environments:
- Notion
- GitHub
- Filesystem
- Postgres
- Playwright
- Playwright-WebArena
### General Procedure
MCPMark is designed to run agentic tasks in complex environments **safely**. Specifically, it sets up an isolated environment for the experiment, completes the task, and then destroys the environment without affecting existing user profiles or data.
### How to Use MCPMark
1. MCPMark Installation.
2. Authorize services (for GitHub and Notion).
3. Configure the environment variables in `.mcp_env`.
4. Run MCPMark experiment.
Please refer to [Quick Start](./quickstart.md) for details on how to properly start a sample filesystem experiment, and the [Task Page](./datasets/task.md) for task details. Please visit [Installation and Docker Usage](./installation_and_docker_usage.md) for information on the full MCPMark setup.
### Running MCPMark
MCPMark supports the following modes for running experiments (suppose the experiment is named new_exp, the models used are o3 and gpt-4.1, and the environment is Notion), with K repeated runs.
#### MCPMark in Pip Installation
```bash
# Evaluate ALL tasks
python -m pipeline --exp-name new_exp --mcp notion --tasks all --models o3 --k K
# Evaluate a single task group (online_resume)
python -m pipeline --exp-name new_exp --mcp notion --tasks online_resume --models o3 --k K
# Evaluate one specific task (task_1 in online_resume)
python -m pipeline --exp-name new_exp --mcp notion --tasks online_resume/task_1 --models o3 --k K
# Evaluate multiple models
python -m pipeline --exp-name new_exp --mcp notion --tasks all --models o3,gpt-4.1 --k K
```
#### MCPMark in Docker Installation
```bash
# Run all tasks for one service
./run-task.sh --mcp notion --models o3 --exp-name new_exp --tasks all
# Run comprehensive benchmark across all services
./run-benchmark.sh --models o3,gpt-4.1 --exp-name new_exp --docker
```
#### Experiment Auto-Resume
When an experiment is re-run, only unfinished tasks are executed. Tasks that previously failed due to pipeline errors (such as a State Duplication Error or an MCP Network Error) are also retried automatically.
### Results
The experiment results are written to `./results/` (JSON + CSV).
#### Result Aggregation (for K > 1)
MCPMark supports aggregated metrics of pass@1, pass@K, pass^K, and avg@K (an illustrative computation is sketched after the command below).
```bash
python -m src.aggregators.aggregate_results --exp-name new_exp
```
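For intuition only, under the common reading of these metrics (pass@K: at least one of the K runs succeeds; pass^K: all K runs succeed; avg@K: mean success rate over runs), they can be computed from per-task outcomes as sketched below. The definitions MCPMark actually reports are implemented in `src/aggregators/aggregate_results.py`, which should be treated as the source of truth.
```python
# Illustrative computation only; see src/aggregators/aggregate_results.py
# for the definitions MCPMark actually uses.
runs = {
    "online_resume/task_1": [True, False, True, True],    # K=4 outcomes per task
    "online_resume/task_2": [False, False, False, True],
}

n_tasks = len(runs)
pass_at_k = sum(any(r) for r in runs.values()) / n_tasks   # any of the K runs passed
pass_pow_k = sum(all(r) for r in runs.values()) / n_tasks  # all K runs passed
avg_at_k = sum(sum(r) / len(r) for r in runs.values()) / n_tasks  # mean success rate

print(f"pass@K={pass_at_k:.2f}  pass^K={pass_pow_k:.2f}  avg@K={avg_at_k:.2f}")
```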
### Model Support
MCPMark supports the following models with the corresponding providers (model codes in brackets).
#### OpenAI
- GPT-5 (gpt-5)
- o3 (o3)
#### Anthropic
- Claude-4.1-Opus (claude-4.1-opus)
- Claude-4-Sonnet (claude-4-sonnet)
#### Google
- Gemini-2.5-Pro (gemini-2.5-pro)
#### Grok
- Grok-4 (grok-4)
#### Deepseek
- DeepSeek-Chat (deepseek-chat)
#### Alibaba
- Qwen3-Coder (qwen-3-coder)
#### Kimi
- Kimi-K2 (k2)
### Want to contribute?
Visit the [Contributing Page](./contributing/make-contribution.md) to learn how to contribute to MCPMark.
================================================
FILE: docs/mcp/filesystem.md
================================================
# Filesystem
This guide walks you through preparing your filesystem environment for MCPMark.
## 1 · Configure Environment Variables
Set the `FILESYSTEM_TEST_ROOT` environment variable in your `.mcp_env` file:
```env
## Filesystem
FILESYSTEM_TEST_ROOT=./test_environments
```
**Recommended**: Use `FILESYSTEM_TEST_ROOT=./test_environments` (relative to project root)
---
## 2 · Automatic Test Environment Download
Our code automatically downloads test folders to your specified `FILESYSTEM_TEST_ROOT` directory when the pipeline starts running.
**Downloaded Structure**:
```
./test_environments/
├── desktop/ # Desktop environment
├── desktop_template/ # Template files for desktop
├── file_context/ # File content understanding tasks
├── file_property/ # File metadata and properties related tasks
├── folder_structure/ # Directory organization tasks
├── legal_document/ # Legal document processing
├── papers/ # Academic paper tasks
├── student_database/ # Database management tasks
├── threestudio/ # 3D Generation codebase
└── votenet/ # 3D Object Detection codebase
```
---
## 3 · Running Filesystem Tasks
**Basic Command**:
```bash
python -m pipeline --exp-name EXPNAME --mcp filesystem --tasks FILESYSTEMTASK --models MODEL --k K
```
**Docker Usage (Recommended)**
Docker is recommended to avoid library version conflicts:
```bash
# Build Docker image
./build-docker.sh
# Run with Docker
./run-task.sh --mcp filesystem --models MODEL --exp-name EXPNAME --tasks FILESYSTEMTASK --k K
```
Here *EXPNAME* refers to the customized experiment name, *FILESYSTEMTASK* refers to the filesystem task or task group selected (see [Task Page](../datasets/task.md) for specific task information), *MODEL* refers to the selected model (see [Introduction Page](../introduction.md) for supported models), and *K* refers to the number of independent runs.
---
## 4 · Troubleshooting
**Common Issues**:
- **Test Environment Not Found**: Ensure `FILESYSTEM_TEST_ROOT` is set correctly
- **Prerequisites**: Make sure your terminal has `wget` and `unzip` commands available
- **Recommended**: Use Docker to prevent library version conflicts
================================================
FILE: docs/mcp/github.md
================================================
# GitHub
This guide walks you through preparing your GitHub environment for MCPMark and authenticating the CLI tools with support for **token pooling** to mitigate rate limits.
## 1 · Prepare an Evaluation Organization in GitHub
1. **Create a free GitHub Organization**
- In GitHub, click your avatar → **Your organizations** → **New organization**.
- We recommend a name like `mcpmark-eval-xxx`. (Check if there is a conflict with other organization names.)
- This keeps all benchmark repositories isolated from your personal and work code.
- [](https://postimg.cc/k27xdXc4)
2. **Create Multiple GitHub Accounts (Recommended for Rate Limit Relief)**
To effectively distribute API load and avoid rate limiting, we recommend creating **2-4 additional GitHub accounts**:
- Create new GitHub accounts (e.g., `your-name-eval-1`, `your-name-eval-2`, etc.)
- **Important**: Add all these accounts as **Owners** to your evaluation organization
- This allows the token pooling system to distribute requests across multiple accounts
3. **Generate Fine-Grained Personal Access Tokens (PATs) for Each Account**
**Repeat the following process for each GitHub account (including your main account):**
- Navigate to *Settings → Developer settings → Personal access tokens → Fine-grained tokens*
- Click **Generate new token**, select the evaluation organization you created
- [](https://postimg.cc/Mv9yqJrm)
- Give the token a descriptive name (e.g., *MCPMark Eval Token - Account 1*)
- Under **Repository permissions** and **Organization permissions**, enable **All permissions** (read and write if applicable)
- [](https://postimg.cc/14HFrZP1)
- Copy the generated token and save it safely — you'll need all tokens for the next step
4. **Configure Token Pooling in `.mcp_env`**
In your project root, edit (or create) the `.mcp_env` file and add your tokens:
**For single token (Basic setup):**
```env
## GitHub - Single Token Configuration
GITHUB_TOKENS="your-single-token-here"
GITHUB_EVAL_ORG="your-eval-org-name"
```
**For multiple tokens (Recommended for handling rate limits):**
```env
## GitHub - Token Pooling Configuration
GITHUB_TOKENS="token1,token2,token3,token4"
GITHUB_EVAL_ORG="your-eval-org-name"
```
**Important Notes:**
- Replace `token1,token2,token3,token4` with your actual tokens (comma-separated, no spaces)
- **2-4 tokens** is recommended for optimal rate limit distribution
- All tokens must have **the same permissions** on the evaluation organization
- The system automatically rotates between tokens to distribute API load
---
## 2 · Download the Sample Repository State
We have pre-exported several popular open-source repositories along with curated Issues and PRs.
1. Download the archive from [Google Drive](https://drive.google.com/drive/folders/16bFDjdtqJYzYJlqKcjKBGomo8DwOhWcN?usp=drive_link).
2. Extract it so that the directory `./github_state/` appears in the project root:
```bash
mkdir -p github_state
unzip github_state.zip -d ./github_state
```
---
## 3 · Add New Repositories (Optional)
If you want to benchmark additional repositories:
1. Export the desired repository state:
```bash
python -m src.mcp_services.github.repo_exporter --source_repo_url owner/name --max-issues 20 --max-pulls 5
```
2. Open `src/mcp_services/github/github_state_manager.py` and add a new entry to `self.initial_state_mapping` pointing to the exported folder (an illustrative sketch follows this list).
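As a purely hypothetical illustration of such an entry (the key and value formats below are assumptions; copy the shape of the existing entries in `github_state_manager.py` rather than this sketch):
```python
from pathlib import Path

# Hypothetical shape of an initial_state_mapping entry; the real dictionary
# lives in src/mcp_services/github/github_state_manager.py and may use a
# different value format, so mirror the existing entries there.
initial_state_mapping = {
    "existing/repo": Path("./github_state/existing__repo"),
    "owner/name": Path("./github_state/owner__name"),  # folder exported in step 1
}

print(initial_state_mapping["owner/name"])
```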
---
## 4 · GitHub Rate Limits & Token Pooling Benefits
### Understanding Rate Limits
Fine-grained tokens are subject to GitHub API rate limits:
- **Read operations**: 5,000 requests per hour per token
- **General write operations**: 80 writes per minute and 500 writes per hour per token
- **Content creation (Issues, PRs, Comments)**: **500 requests per hour per token** (Secondary Rate Limit)
### How Token Pooling Helps
With **token pooling**, MCPMark automatically does the following (a rough sketch of the rotation idea follows this list):
- **Distributes requests** across multiple tokens to multiply your rate limits
- **Rotates tokens** for each task execution to balance load
- **Handles rate limit failures** by trying the next available token
- **Ensures consistency** between agent execution and verification
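The sketch below illustrates only the round-robin rotation idea; it is not the actual implementation in `src/mcp_services/github/token_pool.py`, which also handles rate-limit failures and failover.
```python
import itertools

class RoundRobinTokenPool:
    """Minimal illustration of rotating GitHub tokens to spread API load."""

    def __init__(self, tokens: list[str]):
        if not tokens:
            raise ValueError("at least one GitHub token is required")
        self._cycle = itertools.cycle(tokens)

    def next_token(self) -> str:
        # Hand out tokens in round-robin order for each task execution.
        return next(self._cycle)

# Tokens would normally be parsed from the comma-separated GITHUB_TOKENS variable.
pool = RoundRobinTokenPool("token1,token2,token3,token4".split(","))
print([pool.next_token() for _ in range(6)])
```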
### Example: Rate Limit Multiplication
**Read Operations:**
- **Single token**: 5,000 requests/hour
- **4 tokens**: ~20,000 requests/hour total capacity
**Content Creation (Critical for MCPMark):**
- **Single token**: 500 content creation requests/hour
- **4 tokens**: ~2,000 content creation requests/hour total capacity
- **Automatic failover**: If one token hits limits, others continue working
This dramatically improves evaluation performance, especially for large task batches or frequent testing cycles. **The content creation limit is often the bottleneck**, making token pooling essential for efficient evaluations.
### Repository Limits
MCPMark places a cap on the number of PRs and issues (≤ 50 in total) per repository to ensure reasonable evaluation times and to stay within rate limits.
## 5 · Running GitHub Tasks
1. Configure environment variables: make sure `GITHUB_TOKENS` and `GITHUB_EVAL_ORG` are properly set in `.mcp_env`.
2. For single task or task group, run
```bash
python -m pipeline --exp-name EXPNAME --mcp github --tasks GITHUBTASK --models MODEL --k K
```
Here *EXPNAME* refers to the customized experiment name, *GITHUBTASK* refers to the GitHub task or task group selected (see [Task Page](../datasets/task.md) for specific task information), *MODEL* refers to the selected model (see [Introduction Page](../introduction.md) for supported models), and *K* refers to the number of independent runs.
================================================
FILE: docs/mcp/notion.md
================================================
# Notion
This guide walks you through preparing your Notion environment for MCPMark and authenticating the CLI tools.
> Note: Set your Notion app and workspace interface language to English. We use Playwright for browser automation and our locator logic relies on raw English text in the UI. Non-English interfaces can cause element selection to fail.
## 1 · Set up Notion Environment
1. **Duplicate the MCPMark Source Pages**
Copy the template database and pages into your workspace from the public template following this tutorial:
[Duplicate MCPMark Source](https://painted-tennis-ebc.notion.site/MCPBench-Source-Hub-23181626b6d7805fb3a7d59c63033819).
2. **Set up the Source and Eval Hub for Environment Isolation**
- Prepare **two separate Notion pages**:
- **Source Hub**: Stores all the template databases/pages. Managed by `SOURCE_NOTION_API_KEY`.
- **Eval Hub**: Only contains the duplicated templates for the current evaluation. Managed by `EVAL_NOTION_API_KEY`.
- In Notion, create an **empty page** in your Eval Hub. The page name **must exactly match** the value you set for `EVAL_PARENT_PAGE_TITLE` in your environment variables (e.g., `MCPMark Eval Hub`).
- Name your **Source Hub** page to match `SOURCE_PARENT_PAGE_TITLE` (default: `MCPMark Source Hub`). This is where all initial-state templates live; we enumerate this page’s first-level children by exact title.
- In Notion's **Connections** settings:
- Bind the integration corresponding to `EVAL_NOTION_API_KEY` to the Eval Hub parent page you just created.
- Bind the integration corresponding to `SOURCE_NOTION_API_KEY` to your Source Hub (where the templates are stored).
3. **Create Notion Integrations & Grant Access**
a. Visit [Notion Integrations](https://www.notion.so/profile/integrations) and create **two internal integrations** (one for Source Hub, one for Eval Hub).
b. Copy the generated **Internal Integration Tokens** (these will be your `SOURCE_NOTION_API_KEY` and `EVAL_NOTION_API_KEY`).
c. Share the **Source Hub** with the Source integration, and the **Eval Hub parent page** with the Eval integration (*Full Access*).
[](https://postimg.cc/XXVGJD5H)
[](https://postimg.cc/NKrLShhM)
[](https://postimg.cc/CRDLJjDn)
[](https://postimg.cc/n9Cnm7pz)
[](https://postimg.cc/s1QFp35v)
---
## 2 · Authenticate with Notion
```bash
# First, install Playwright and the browser binaries
playwright install
# Then, run the Notion login helper with your preferred browser
python -m src.mcp_services.notion.notion_login_helper --browser {firefox|chromium}
```
The verification script will tell you which browser is working properly. The pipeline defaults to using **chromium**. Our pipeline has been **fully tested on macOS and Linux**.
## 3 · Running Notion Tasks
1. Configure environment variables: make sure the following service credentials are added in `.mcp_env`.
```env
## Notion
SOURCE_NOTION_API_KEY="your-source-notion-api-key" # For Source Hub (templates)
EVAL_NOTION_API_KEY="your-eval-notion-api-key" # For Eval Hub (active evaluation)
SOURCE_PARENT_PAGE_TITLE="MCPMark Source Hub" # Source hub page name (exact match)
EVAL_PARENT_PAGE_TITLE="MCPMark Eval Hub" # Must match the name of the empty page you created in Eval Hub
PLAYWRIGHT_BROWSER="chromium" # default to chromium, you can also choose firefox
PLAYWRIGHT_HEADLESS="True"
```
2. For single task or task group, run
```bash
python -m pipeline --exp-name EXPNAME --mcp notion --tasks NOTIONTASK --models MODEL --k K
```
Here *EXPNAME* refers to the customized experiment name, *NOTIONTASK* refers to the Notion task or task group selected (see [Task Page](../datasets/task.md) for specific task information), *MODEL* refers to the selected model (see [Introduction Page](../introduction.md) for supported models), and *K* refers to the number of independent runs.
================================================
FILE: docs/mcp/playwright.md
================================================
# Playwright
This guide walks you through setting up WebArena environments for Playwright MCP automated testing, including Shopping, Shopping Admin, and Reddit instances.
Section 1 is designed mainly for completing the Playwright-WebArena tasks.
## 1. Setup WebArena Environment (For Playwright-WebArena Tasks)
### 1.1 Download Docker Images
[WebArena](https://github.com/web-arena-x/webarena/tree/main/environment_docker) provides Docker images from multiple sources. Choose the fastest one for your network:
### Shopping Environment (Port 7770)
```bash
# Option 1: Google Drive (Recommended)
pip install gdown
gdown 1gxXalk9O0p9eu1YkIJcmZta1nvvyAJpA
# Option 2: Archive.org
wget https://archive.org/download/webarena-env-shopping-image/shopping_final_0712.tar
# Option 3: CMU Server
wget http://metis.lti.cs.cmu.edu/webarena-images/shopping_final_0712.tar
```
### Shopping Admin Environment (Port 7780)
```bash
# Option 1: Google Drive (Recommended)
gdown 1See0ZhJRw0WTTL9y8hFlgaduwPZ_nGfd
# Option 2: Archive.org
wget https://archive.org/download/webarena-env-shopping-admin-image/shopping_admin_final_0719.tar
# Option 3: CMU Server
wget http://metis.lti.cs.cmu.edu/webarena-images/shopping_admin_final_0719.tar
```
### Reddit Environment (Port 9999)
```bash
# Option 1: Google Drive (Recommended)
gdown 17Qpp1iu_mPqzgO_73Z9BnFjHrzmX9DGf
# Option 2: Archive.org
wget https://archive.org/download/webarena-env-forum-image/postmill-populated-exposed-withimg.tar
# Option 3: CMU Server
wget http://metis.lti.cs.cmu.edu/webarena-images/postmill-populated-exposed-withimg.tar
```
### 1.2 Deploy Environments
#### Shopping (E-commerce Site)
```bash
docker load --input shopping_final_0712.tar
# Start container
docker run --name shopping -p 7770:80 -d shopping_final_0712
# Wait for service initialization (2-3 minutes)
sleep 180
# Configure for local access
docker exec shopping /var/www/magento2/bin/magento setup:store-config:set --base-url="http://localhost:7770"
docker exec shopping mysql -u magentouser -pMyPassword magentodb -e "UPDATE core_config_data SET value='http://localhost:7770/' WHERE path IN ('web/secure/base_url', 'web/unsecure/base_url');"
docker exec shopping /var/www/magento2/bin/magento cache:flush
```
**Access**: `http://localhost:7770`
#### Shopping Admin (Management Panel)
```bash
docker load --input shopping_admin_final_0719.tar
# Start container
docker run --name shopping_admin -p 7780:80 -d shopping_admin_final_0719
# Wait for service initialization
sleep 120
# Configure for local access
docker exec shopping_admin /var/www/magento2/bin/magento setup:store-config:set --base-url="http://localhost:7780"
docker exec shopping_admin mysql -u magentouser -pMyPassword magentodb -e "UPDATE core_config_data SET value='http://localhost:7780/' WHERE path IN ('web/secure/base_url', 'web/unsecure/base_url');"
docker exec shopping_admin php /var/www/magento2/bin/magento config:set admin/security/password_is_forced 0
docker exec shopping_admin php /var/www/magento2/bin/magento config:set admin/security/password_lifetime 0
docker exec shopping_admin /var/www/magento2/bin/magento cache:flush
```
**Access**: `http://localhost:7780/admin`
**Admin Credentials**: `admin / admin1234`
#### Reddit (Forum)
```bash
docker load --input postmill-populated-exposed-withimg.tar
# Start container
docker run --name forum -p 9999:80 -d postmill-populated-exposed-withimg
# Wait for PostgreSQL initialization
sleep 120
# Verify service status
docker logs forum | grep "database system is ready"
curl -I http://localhost:9999
```
**Access**: `http://localhost:9999`
### 1.3 External Access Configuration
For cloud deployments (GCP, AWS, etc.), configure external access:
#### Configure Firewall (GCP Example)
```bash
# Shopping environment
gcloud compute firewall-rules create allow-shopping-7770 \
--allow tcp:7770 --source-ranges 0.0.0.0/0
# Shopping Admin
gcloud compute firewall-rules create allow-shopping-admin-7780 \
--allow tcp:7780 --source-ranges 0.0.0.0/0
# Reddit
gcloud compute firewall-rules create allow-reddit-9999 \
--allow tcp:9999 --source-ranges 0.0.0.0/0
```
#### Update Base URLs for External Access
```bash
# Get external IP
EXTERNAL_IP=$(curl -s ifconfig.me)
# Shopping
docker exec shopping /var/www/magento2/bin/magento setup:store-config:set --base-url="http://${EXTERNAL_IP}:7770"
docker exec shopping mysql -u magentouser -pMyPassword magentodb -e "UPDATE core_config_data SET value='http://${EXTERNAL_IP}:7770/' WHERE path IN ('web/secure/base_url', 'web/unsecure/base_url');"
docker exec shopping /var/www/magento2/bin/magento cache:flush
# Shopping Admin
docker exec shopping_admin /var/www/magento2/bin/magento setup:store-config:set --base-url="http://${EXTERNAL_IP}:7780"
docker exec shopping_admin mysql -u magentouser -pMyPassword magentodb -e "UPDATE core_config_data SET value='http://${EXTERNAL_IP}:7780/' WHERE path IN ('web/secure/base_url', 'web/unsecure/base_url');"
docker exec shopping_admin /var/www/magento2/bin/magento cache:flush
```
### 1.4 Alternative Access Methods (Not Verified)
#### Cloudflared Tunnel (Free & Persistent)
```bash
# Install cloudflared
wget https://github.com/cloudflare/cloudflared/releases/latest/download/cloudflared-linux-amd64
sudo mv cloudflared-linux-amd64 /usr/local/bin/cloudflared
sudo chmod +x /usr/local/bin/cloudflared
# Create tunnels
cloudflared tunnel --url http://localhost:7770 # Shopping
cloudflared tunnel --url http://localhost:7780 # Admin
cloudflared tunnel --url http://localhost:9999 # Reddit
```
#### ngrok (Quick Sharing)
```bash
# Install ngrok
wget https://bin.equinox.io/c/bNyj1mQVY4c/ngrok-v3-stable-linux-amd64.tgz
tar xvzf ngrok-v3-stable-linux-amd64.tgz
sudo mv ngrok /usr/local/bin
# Create tunnel (choose port)
ngrok http 7770 # For Shopping
```
## 2. Running Playwright Tasks
1. Configure environment variables: make sure the following service credentials are added in `.mcp_env`.
```env
PLAYWRIGHT_BROWSER="chromium" # default to chromium, you can also choose firefox
PLAYWRIGHT_HEADLESS="True"
```
2. For single task or task group, run
```bash
python -m pipeline --exp-name EXPNAME --mcp MCP --tasks PLAYWRIGHTTASK --models MODEL --k K
```
Here *EXPNAME* refers to the customized experiment name, *MCP* refers to `playwright` or `playwright_webarena` depending on the task, *PLAYWRIGHTTASK* refers to the task or task group selected (see [Task Page](../datasets/task.md) for specific task information), *MODEL* refers to the selected model (see [Introduction Page](../introduction.md) for supported models), and *K* refers to the number of independent runs.
## 3. Troubleshooting
### Container Issues
```bash
# Check status
docker ps -a | grep -E "shopping|forum"
# View logs
docker logs [container_name] --tail 50
# Restart container
docker restart [container_name]
```
### Access Problems
- **First load is slow** (1-2 minutes for Magento) - this is normal
- **Ensure ports are available**: `netstat -tlnp | grep -E "7770|7780|9999"`
- **Clear cache after URL changes**: Required for Magento environments
### Reset Environment
```bash
# Stop and remove container
docker stop [container_name]
docker rm [container_name]
# Re-deploy (follow steps in Section 1.2)
```
## 4. Important Notes
- **Service startup time**: Allow 2-3 minutes for Magento, 1-2 minutes for Reddit
- **Memory requirements**: Ensure Docker has at least 4GB RAM allocated per container
- **URL configuration**: Must reconfigure base URLs after container restart for external access
- **Port assignments**:
- 7770: Shopping
- 7780: Shopping Admin
- 9999: Reddit
================================================
FILE: docs/mcp/postgres.md
================================================
# PostgreSQL
This guide walks you through preparing your PostgreSQL environment for MCPMark evaluation.
## 1. Setup PostgreSQL Environment
### 1.1 Start PostgreSQL with Docker
1. **Run PostgreSQL Container**
Start a PostgreSQL instance using Docker:
```bash
docker run -d \
--name mcpmark-postgres \
-e POSTGRES_PASSWORD=password \
-e POSTGRES_USER=postgres \
-p 5432:5432 \
pgvector/pgvector:0.8.0-pg17-bookworm
```
2. **Verify Container is Running**
```bash
docker ps | grep mcpmark-postgres
```
---
### 1.2 Import Sample Databases
1. **Download Database Backups**
Download the backup files and place them in `./postgres_state/` directory:
```bash
mkdir -p ./postgres_state
cd ./postgres_state
# Download all database backups
wget https://storage.mcpmark.ai/postgres/employees.backup
wget https://storage.mcpmark.ai/postgres/chinook.backup
wget https://storage.mcpmark.ai/postgres/dvdrental.backup
wget https://storage.mcpmark.ai/postgres/sports.backup
wget https://storage.mcpmark.ai/postgres/lego.backup
cd ..
```
2. **Create Databases and Restore from Backups**
> Make sure your Postgres client version matches the server's version (e.g., pg17).
```bash
# Set the password environment variable
export PGPASSWORD=password
# Create and restore each database
createdb -h localhost -U postgres employees
pg_restore -h localhost -U postgres -d employees -v ./postgres_state/employees.backup
createdb -h localhost -U postgres chinook
pg_restore -h localhost -U postgres -d chinook -v ./postgres_state/chinook.backup
createdb -h localhost -U postgres dvdrental
pg_restore -h localhost -U postgres -d dvdrental -v ./postgres_state/dvdrental.backup
createdb -h localhost -U postgres sports
pg_restore -h localhost -U postgres -d sports -v ./postgres_state/sports.backup
createdb -h localhost -U postgres lego
pg_restore -h localhost -U postgres -d lego -v ./postgres_state/lego.backup
```
3. **Verify Databases are Imported**
```bash
# List all databases
PGPASSWORD=password psql -h localhost -U postgres -c "\l"
```
---
## 2. Configure Environment Variables
Configure environment variables: make sure the following service credentials are added in `.mcp_env`:
```env
## PostgreSQL Configuration
POSTGRES_HOST="localhost"
POSTGRES_PORT="5432"
POSTGRES_USERNAME="postgres"
POSTGRES_PASSWORD="password"
```
## 3. Verify Connection
Verify the PostgreSQL setup is working correctly:
```bash
# Test connection using psql
PGPASSWORD=password psql -h localhost -U postgres -c "SELECT version();"
```
## 4. Common Operations
### Stop PostgreSQL Container
```bash
docker stop mcpmark-postgres
```
### Start PostgreSQL Container
```bash
docker start mcpmark-postgres
```
### Remove PostgreSQL Container (Clean Setup)
```bash
docker stop mcpmark-postgres
docker rm mcpmark-postgres
```
### Access PostgreSQL Shell
```bash
PGPASSWORD=password psql -h localhost -U postgres
```
## 5. Running Postgres Experiment
For single task or task group, run
```bash
python -m pipeline --exp-name EXPNAME --mcp postgres --tasks POSTGRESTASK --models MODEL --k K
```
Here *EXPNAME* refers to the customized experiment name, *POSTGRESTASK* refers to the Postgres task or task group selected (see `tasks/` for specific task information), *MODEL* refers to the selected model (see [Introduction Page](../introduction.md) for supported models), and *K* refers to the number of independent runs.
## 6. Troubleshooting
### Port Already in Use
If port 5432 is already in use, you can use a different port:
```bash
docker run -d \
--name mcpmark-postgres \
-e POSTGRES_PASSWORD=password \
-e POSTGRES_USER=postgres \
-p 5433:5432 \
pgvector/pgvector:0.8.0-pg17-bookworm
```
Remember to update `POSTGRES_PORT="5433"` in your `.mcp_env` file.
### Connection Refused
Ensure the Docker container is running and the port mapping is correct:
```bash
docker ps
docker logs mcpmark-postgres
```
================================================
FILE: docs/quickstart.md
================================================
# Quick Start
To quickly experience MCPMark, we recommend first preparing the environment and then running a sample filesystem task.
### 1. Clone MCPMark
```bash
git clone https://github.com/eval-sys/mcpmark.git
cd mcpmark
```
### 2. Setup Environment Variables
To set up model access via environment variables, edit the `.mcp_env` file in the `mcpmark/` root.
```env
# Model Providers (set only those you need)
## Google Gemini
GEMINI_BASE_URL="https://your-gemini-base-url.com/v1"
GEMINI_API_KEY="your-gemini-api-key"
## DeepSeek
DEEPSEEK_BASE_URL="https://your-deepseek-base-url.com/v1"
DEEPSEEK_API_KEY="your-deepseek-api-key"
## OpenAI
OPENAI_BASE_URL="https://your-openai-base-url.com/v1"
OPENAI_API_KEY="your-openai-api-key"
## Anthropic
ANTHROPIC_BASE_URL="https://your-anthropic-base-url.com/v1"
ANTHROPIC_API_KEY="your-anthropic-api-key"
## Moonshot
MOONSHOT_BASE_URL="https://your-moonshot-base-url.com/v1"
MOONSHOT_API_KEY="your-moonshot-api-key"
## xAI
XAI_BASE_URL="https://your-xai-base-url.com/v1"
XAI_API_KEY="your-xai-api-key"
```
### 3. Run Quick Example in MCPMark
Suppose you want to evaluate `gemini-2.5-flash` on the `size_classification` task in `file_property` (which categorizes files by their sizes) and name your experiment `test-run-1`. You can use the following command:
```bash
python -m pipeline \
  --exp-name test-run-1 \
  --mcp filesystem \
  --tasks file_property/size_classification \
  --models gemini-2.5-flash
```
Here is the expected output (verification may fail depending on the model you choose).
[](https://postimg.cc/Yj8nPZkQ)
The results are saved under `results/{exp_name}/{mcp}_{model}/{tasks}`. If `exp-name` is not specified, the default name is the timestamp of the experiment (specifying `exp-name` is useful for resuming experiments).
For other MCP services, please refer to the [Installation and Docker Usage Page](./installation_and_docker_usage.md) for detailed instructions.
================================================
FILE: pipeline.py
================================================
#!/usr/bin/env python3
"""
MCPMark Unified Evaluation Pipeline
===================================
This script provides an automated evaluation pipeline for testing Large Language Models (LLMs)
on various Model Context Protocol (MCP) services like Notion, GitHub, and PostgreSQL.
"""
import argparse
import sys
from datetime import datetime
from pathlib import Path
from dotenv import load_dotenv
from src.logger import get_logger
from src.evaluator import MCPEvaluator
from src.agents import AGENT_REGISTRY
from src.factory import MCPServiceFactory
from src.model_config import ModelConfig
# Suppress httpcore/anyio cleanup exceptions that don't affect functionality.
# These "Exception ignored" messages are caused by MCP library's streamablehttp_client
# timing issues during cleanup, but don't impact actual task execution.
def _suppress_cleanup_exceptions(unraisable):
"""Suppress known cleanup exceptions from httpcore/anyio."""
msg = str(unraisable.exc_value)
if any(
pattern in msg
for pattern in [
"async generator ignored GeneratorExit",
"cancel scope in a different task",
"no running event loop",
]
):
return # Silently ignore
# Use default handler for other exceptions
sys.__unraisablehook__(unraisable)
sys.unraisablehook = _suppress_cleanup_exceptions
# Initialize logger
logger = get_logger(__name__)
def main():
"""Main entry point for the evaluation pipeline."""
parser = argparse.ArgumentParser(description="MCPMark Unified Evaluation Pipeline.")
supported_mcp_services = MCPServiceFactory.get_supported_mcp_services()
supported_models = ModelConfig.get_supported_models()
# Main configuration
parser.add_argument(
"--mcp",
default="filesystem",
choices=supported_mcp_services,
help="MCP service to use (default: filesystem)",
)
parser.add_argument(
"--models",
required=True,
help="Comma-separated list of models to evaluate (e.g., 'o3,k2,gpt-4.1')",
)
parser.add_argument(
"--agent",
default="mcpmark",
choices=sorted(AGENT_REGISTRY.keys()),
help="Agent implementation to use (default: mcpmark)",
)
parser.add_argument(
"--tasks",
default="all",
help='Tasks to run: (1). "all"; (2). "category"; or (3). "category/task".',
)
parser.add_argument(
"--task-suite",
default="standard",
choices=["standard", "easy"],
help="Task suite to run (default: standard). Use 'easy' to run the lightweight dataset.",
)
parser.add_argument(
"--exp-name",
default=None,
help="Experiment name; results are saved under results// (default: YYYY-MM-DD-HH-MM-SS)",
)
parser.add_argument(
"--k",
type=int,
default=4,
help="Number of evaluation runs (default: 1)",
)
# Execution configuration
parser.add_argument(
"--timeout",
type=int,
default=3600,
help="Timeout in seconds for agent execution",
)
parser.add_argument(
"--compaction-token",
type=int,
default=999_999_999,
help=(
"Auto-compact conversation when prompt tokens (from API usage) reach this limit. "
"Use 999999999 to disable compaction."
),
)
parser.add_argument(
"--reasoning-effort",
default="default",
choices=["default", "minimal", "low", "medium", "high"],
help="Reasoning effort level for supported models (default: None)",
)
# Output configuration
parser.add_argument(
"--output-dir",
type=Path,
default=Path("./results"),
help="Directory to save results",
)
# Load arguments and environment variables
args = parser.parse_args()
load_dotenv(dotenv_path=".mcp_env", override=False)
# Validate k parameter and exp-name requirement
if args.k > 1 and args.exp_name is None:
parser.error("--exp-name is required when k > 1")
# Generate default exp-name if not provided
if args.exp_name is None:
args.exp_name = datetime.now().strftime("%Y-%m-%d-%H-%M-%S")
# Parse models (no validation - allow unsupported models)
model_list = [m.strip() for m in args.models.split(",") if m.strip()]
if not model_list:
parser.error("No valid models provided")
# Log warning for unsupported models but don't error
unsupported_models = [m for m in model_list if m not in supported_models]
if unsupported_models:
logger.warning(
f"Using unsupported models: {', '.join(unsupported_models)}. Will use OPENAI_BASE_URL and OPENAI_API_KEY from environment."
)
logger.info("MCPMark Evaluation")
logger.info(
f"Experiment: {args.exp_name} | {len(model_list)} Model(s): {', '.join(model_list)}"
)
logger.info(f"Task suite: {args.task_suite}")
if args.k > 1:
logger.info(f"Running {args.k} evaluation runs for pass@k metrics")
# Run k evaluation runs
for run_idx in range(1, args.k + 1):
if args.k > 1:
logger.info(f"\n{'=' * 80}")
logger.info(f"Starting Run {run_idx}/{args.k}")
logger.info(f"{'=' * 80}\n")
# For k-runs, results/{exp}/{mcp}__{model}/run-N
run_exp_name = f"run-{run_idx}"
run_output_dir = args.output_dir / args.exp_name
else:
# For single run, still use run-1 under service_model
run_exp_name = "run-1"
run_output_dir = args.output_dir / args.exp_name
# Run evaluation for each model
for i, model in enumerate(model_list, 1):
logger.info(f"\n{'=' * 60}")
if args.k > 1:
logger.info(
f"Run {run_idx}/{args.k} | Model {i}/{len(model_list)}: {model}"
)
else:
logger.info(f"Starting evaluation {i}/{len(model_list)}: {model}")
logger.info(f"{'=' * 60}\n")
# Initialize and run the evaluation pipeline for this model
pipeline = MCPEvaluator(
mcp_service=args.mcp,
model=model,
timeout=args.timeout,
exp_name=run_exp_name,
output_dir=run_output_dir,
reasoning_effort=args.reasoning_effort,
agent_name=args.agent,
task_suite=args.task_suite,
compaction_token=args.compaction_token,
)
pipeline.run_evaluation(args.tasks)
logger.info(f"📁 Results: {pipeline.base_experiment_dir}")
logger.info(f"\n{'=' * 60}")
if args.k > 1:
logger.info(f"✓ All {args.k} runs completed for {len(model_list)} model(s)")
logger.info(
f"Run `python -m src.aggregators.aggregate_results --exp-name {args.exp_name}` to compute all metrics"
)
else:
logger.info(f"✓ All evaluations completed for {len(model_list)} model(s)")
logger.info(f"{'=' * 60}")
if __name__ == "__main__":
main()
================================================
FILE: pyproject.toml
================================================
[project]
authors = []
name = "MCPMark"
requires-python = ">= 3.11"
version = "0.0.1"
dependencies = [
"notion-client==2.4.0",
"playwright>=1.43.0",
"seaborn>=0.12.0",
"matplotlib>=3.7.0",
"numpy>=1.23.0",
"openai-agents>=0.2.3,<0.3",
"openai>=1.96.1",
"python-dotenv>=1.1.1,<2",
"ruff>=0.12.4,<0.13",
"psycopg2-binary>=2.9.10,<3",
"pyyaml>=6.0.2,<7",
"nest-asyncio>=1.6.0,<2",
"pixi",
"pipx>=1.7.1,<2",
"pgdumplib>=3.1.0,<4",
"litellm==1.80.0"
]
[build-system]
build-backend = "hatchling.build"
requires = ["hatchling"]
[tool.pixi.workspace]
channels = ["conda-forge"]
platforms = [
"osx-arm64",
"linux-aarch64",
"linux-64",
"win-64",
"osx-64",
]
[tool.pixi.tasks]
fmt = "ruff"
[tool.ruff.format]
indent-style = "space"
line-ending = "auto"
[tool.hatch.build.targets.wheel]
packages = ["src", "tasks"]
================================================
FILE: run-benchmark.sh
================================================
#!/bin/bash
# MCPMark Full Benchmark Runner
# Runs all tasks across all MCP services for comprehensive model evaluation
set -e
# Default values
MODELS=""
EXP_NAME=""
USE_DOCKER=false
SERVICES="filesystem,notion,github,postgres,playwright"
PARALLEL=false
TIMEOUT=3600
K=4
# Color codes for output
RED='\033[0;31m'
GREEN='\033[0;32m'
YELLOW='\033[1;33m'
BLUE='\033[0;34m'
NC='\033[0m' # No Color
# Function to print colored output
print_status() {
echo -e "${BLUE}[$(date '+%Y-%m-%d %H:%M:%S')]${NC} $1"
}
print_success() {
echo -e "${GREEN}✓${NC} $1"
}
print_warning() {
echo -e "${YELLOW}⚠${NC} $1"
}
print_error() {
echo -e "${RED}✗${NC} $1"
}
# Parse arguments
while [[ $# -gt 0 ]]; do
case $1 in
--models)
MODELS="$2"
shift 2
;;
--exp-name)
EXP_NAME="$2"
shift 2
;;
--docker)
USE_DOCKER=true
shift
;;
--mcps)
SERVICES="$2"
shift 2
;;
--parallel)
PARALLEL=true
shift
;;
--timeout)
TIMEOUT="$2"
shift 2
;;
--k)
K="$2"
shift 2
;;
--help)
cat << EOF
Usage: $0 --models MODELS --exp-name NAME [OPTIONS]
Run comprehensive benchmark across all MCP services.
Required Options:
--models MODELS Comma-separated list of models to evaluate
(e.g., "o3,gpt-4.1,claude-4-sonnet")
--exp-name NAME Experiment name for organizing results
Options:
--docker Run tasks in Docker containers (recommended)
--mcps SERVICES Comma-separated list of services to test
Default: filesystem,notion,github,postgres,playwright
--parallel Run services in parallel (experimental)
--timeout SECONDS Timeout per task in seconds (default: 3600)
--k RUNS Repeat runs per service for pass@k (default: 4)
Examples:
# Run all services with Docker
$0 --models o3,gpt-4.1 --exp-name benchmark-1 --docker
# Run specific services locally
$0 --models o3 --exp-name test-1 --mcps filesystem,postgres
# Run with parallel execution
$0 --models claude-4 --exp-name parallel-test --docker --parallel
EOF
exit 0
;;
*)
print_error "Unknown option: $1"
echo "Use --help for usage information"
exit 1
;;
esac
done
# Validate required arguments
if [ -z "$MODELS" ]; then
print_error "Error: --models is required"
exit 1
fi
if [ -z "$EXP_NAME" ]; then
print_error "Error: --exp-name is required"
exit 1
fi
# Check prerequisites
if [ "$USE_DOCKER" = true ]; then
if ! command -v docker &> /dev/null; then
print_error "Docker is not installed"
exit 1
fi
# Always use Docker Hub image
DOCKER_IMAGE="evalsysorg/mcpmark:latest"
# Check if Docker image exists locally, pull only if not found
if ! docker image inspect "$DOCKER_IMAGE" >/dev/null 2>&1; then
print_status "Docker image not found locally, pulling from Docker Hub..."
docker pull "$DOCKER_IMAGE" || {
print_error "Failed to pull Docker image from Docker Hub"
exit 1
}
else
print_status "Using local Docker image: $DOCKER_IMAGE"
fi
else
# Check Python installation
if ! command -v python3 &> /dev/null; then
print_error "Python 3 is not installed"
exit 1
fi
# Check if dependencies are installed
if ! python3 -c "import src.evaluator" 2>/dev/null; then
print_warning "Python dependencies not installed"
echo "Installing dependencies..."
pip install -e . || {
print_error "Failed to install dependencies"
exit 1
}
fi
fi
# Check .mcp_env file
if [ ! -f .mcp_env ]; then
print_warning ".mcp_env file not found. Some tasks may fail without API credentials."
echo "Create one from .mcp_env.example: cp .mcp_env.example .mcp_env"
fi
# Convert comma-separated services to array
IFS=',' read -ra SERVICE_ARRAY <<< "$SERVICES"
# Summary
echo ""
print_status "MCPMark Benchmark Configuration"
echo "━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━"
echo "Models: $MODELS"
echo "Experiment: $EXP_NAME"
echo "Services: ${SERVICE_ARRAY[*]}"
echo "Docker: $USE_DOCKER"
echo "Parallel: $PARALLEL"
echo "Timeout: ${TIMEOUT}s per task"
echo "K-Runs: $K"
echo "━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━"
echo ""
# Create results directory
RESULTS_DIR="./results/${EXP_NAME}"
mkdir -p "$RESULTS_DIR"
# Log file for this run with timestamp and models
TIMESTAMP=$(date '+%Y%m%d_%H%M%S')
LOG_FILE="${RESULTS_DIR}/benchmark_${TIMESTAMP}.log"
echo "Benchmark started at $(date '+%Y-%m-%d %H:%M:%S')" > "$LOG_FILE"
echo "Models: $MODELS" >> "$LOG_FILE"
echo "Services: ${SERVICE_ARRAY[*]}" >> "$LOG_FILE"
echo "━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━" >> "$LOG_FILE"
# Function to run a single service
run_service() {
local service=$1
local start_time=$(date +%s)
local start_time_formatted=$(date '+%Y-%m-%d %H:%M:%S')
print_status "[$start_time_formatted] Starting $service tasks..."
if [ "$USE_DOCKER" = true ]; then
# Run with Docker
./run-task.sh --mcp "$service" \
--models "$MODELS" \
--exp-name "$EXP_NAME" \
--tasks all \
--timeout "$TIMEOUT" \
--k "$K" 2>&1 | tee -a "$LOG_FILE"
else
# Run locally
python3 -m pipeline \
--mcp "$service" \
--models "$MODELS" \
--exp-name "$EXP_NAME" \
--tasks all \
--timeout "$TIMEOUT" \
--k "$K" 2>&1 | tee -a "$LOG_FILE"
fi
local exit_code=$?
local end_time=$(date +%s)
local duration=$((end_time - start_time))
if [ $exit_code -eq 0 ]; then
print_success "$service completed in ${duration}s"
echo "[$(date '+%Y-%m-%d %H:%M:%S')] $service: SUCCESS (${duration}s)" >> "${RESULTS_DIR}/summary.txt"
else
print_error "$service failed with exit code $exit_code"
echo "[$(date '+%Y-%m-%d %H:%M:%S')] $service: FAILED (exit code $exit_code)" >> "${RESULTS_DIR}/summary.txt"
fi
return $exit_code
}
# Track overall results
TOTAL_SERVICES=${#SERVICE_ARRAY[@]}
COMPLETED_SERVICES=0
FAILED_SERVICES=0
# Main execution
BENCHMARK_START=$(date +%s)
if [ "$PARALLEL" = true ]; then
print_status "Running services in parallel..."
# Run all services in background
for service in "${SERVICE_ARRAY[@]}"; do
(
run_service "$service"
) &
pids+=($!)
done
# Wait for all background jobs and collect exit codes
for pid in "${pids[@]}"; do
# Use the assignment form so an increment from 0 does not trip `set -e`
if wait "$pid"; then
COMPLETED_SERVICES=$((COMPLETED_SERVICES + 1))
else
FAILED_SERVICES=$((FAILED_SERVICES + 1))
fi
done
else
print_status "Running services sequentially..."
for service in "${SERVICE_ARRAY[@]}"; do
if run_service "$service"; then
COMPLETED_SERVICES=$((COMPLETED_SERVICES + 1))
else
FAILED_SERVICES=$((FAILED_SERVICES + 1))
print_warning "Continuing despite failure in $service"
fi
done
fi
BENCHMARK_END=$(date +%s)
TOTAL_DURATION=$((BENCHMARK_END - BENCHMARK_START))
# Generate final summary
echo ""
echo "━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━"
print_status "Benchmark Summary"
echo "━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━"
echo "Completed at: $(date '+%Y-%m-%d %H:%M:%S')"
echo "Total Services: $TOTAL_SERVICES"
echo "Completed: $COMPLETED_SERVICES"
echo "Failed: $FAILED_SERVICES"
echo "Total Duration: ${TOTAL_DURATION}s ($(($TOTAL_DURATION / 60))m $(($TOTAL_DURATION % 60))s)"
echo "Results saved to: $RESULTS_DIR"
echo "Log file: $LOG_FILE"
echo "━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━"
# Final status
if [ $FAILED_SERVICES -eq 0 ]; then
print_success "Benchmark completed successfully!"
exit 0
else
print_warning "Benchmark completed with $FAILED_SERVICES failed service(s)"
exit 1
fi
================================================
FILE: run-task.sh
================================================
#!/bin/bash
# MCPMark Task Runner
# Enable strict error handling
set -euo pipefail
# Default values
SERVICE="filesystem"
NETWORK_NAME="mcp-network"
POSTGRES_CONTAINER="mcp-postgres"
# Resource limits (can be overridden by environment variables)
DOCKER_MEMORY_LIMIT="${DOCKER_MEMORY_LIMIT:-4g}"
DOCKER_CPU_LIMIT="${DOCKER_CPU_LIMIT:-2}"
# Cleanup function
cleanup() {
if [ "${SERVICE:-}" = "postgres" ]; then
if docker ps --format '{{.Names}}' | grep -q "^${POSTGRES_CONTAINER}$"; then
echo "Cleaning up PostgreSQL container..."
docker stop "$POSTGRES_CONTAINER" >/dev/null 2>&1 || true
docker rm "$POSTGRES_CONTAINER" >/dev/null 2>&1 || true
fi
fi
}
# Set up cleanup on exit
trap cleanup EXIT
# Parse arguments
while [[ $# -gt 0 ]]; do
case $1 in
--mcp) SERVICE="$2"; shift 2 ;;
--help)
cat << EOF
Usage: $0 [--mcp SERVICE] [PIPELINE_ARGS]
Run MCPMark tasks in Docker containers.
Options:
--mcp SERVICE MCP service (notion|github|filesystem|playwright|postgres|insforge)
Default: filesystem
Environment Variables:
DOCKER_MEMORY_LIMIT Memory limit for container (default: 4g)
DOCKER_CPU_LIMIT CPU limit for container (default: 2)
DOCKER_IMAGE_VERSION Docker image tag to use (default: latest)
All other arguments are passed directly to the pipeline.
Examples:
$0 --mcp notion --models o3 --exp-name test-1 --tasks all
$0 --mcp postgres --models gpt-4 --exp-name pg-test --tasks basic_queries
EOF
exit 0
;;
*) break ;; # Stop parsing, rest goes to pipeline
esac
done
# Docker image tag can be overridden by environment variable
DOCKER_IMAGE_REPO="evalsysorg/mcpmark"
DOCKER_IMAGE_VERSION="${DOCKER_IMAGE_VERSION:-latest}"
DOCKER_IMAGE="${DOCKER_IMAGE_REPO}:${DOCKER_IMAGE_VERSION}"
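# e.g. pin a specific image build for a run (tag is illustrative):
#   DOCKER_IMAGE_VERSION=v0.2 ./run-task.sh --mcp filesystem --models o3 --exp-name test-1 --tasks all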
# Check if Docker image exists locally, pull only if not found
if ! docker image inspect "$DOCKER_IMAGE" >/dev/null 2>&1; then
echo "Docker image not found locally, pulling from Docker Hub..."
docker pull "$DOCKER_IMAGE" || {
echo "Error: Failed to pull Docker image from Docker Hub"
echo "Please check your internet connection or Docker Hub access"
exit 1
}
else
echo "Using local Docker image: $DOCKER_IMAGE"
fi
# Check if .mcp_env exists (warn but don't fail)
if [ ! -f .mcp_env ]; then
echo "Warning: .mcp_env file not found. Some tasks may fail without API credentials."
fi
# Create network if doesn't exist
if ! docker network ls --format '{{.Name}}' | grep -q "^${NETWORK_NAME}$"; then
echo "Creating Docker network: $NETWORK_NAME"
docker network create "$NETWORK_NAME" || {
echo "Error: Failed to create Docker network"
exit 1
}
fi
# Service-specific configurations
if [ "$SERVICE" = "postgres" ]; then
# For postgres service, ensure PostgreSQL container is running
if ! docker ps --format '{{.Names}}' | grep -q "^${POSTGRES_CONTAINER}$"; then
echo "Starting PostgreSQL container..."
docker run -d \
--name "$POSTGRES_CONTAINER" \
--network "$NETWORK_NAME" \
-e POSTGRES_DATABASE=postgres \
-e POSTGRES_USER=postgres \
-e POSTGRES_PASSWORD="${POSTGRES_PASSWORD:-password}" \
pgvector/pgvector:0.8.0-pg17-bookworm
echo "Waiting for PostgreSQL to be ready..."
for i in {1..10}; do
if docker exec "$POSTGRES_CONTAINER" pg_isready -U postgres >/dev/null 2>&1; then
echo "PostgreSQL is ready!"
break
fi
sleep 1
done
else
echo "PostgreSQL container already running"
fi
# Run task with network connection to postgres
docker run --rm \
--memory="$DOCKER_MEMORY_LIMIT" \
--cpus="$DOCKER_CPU_LIMIT" \
--network "$NETWORK_NAME" \
-e POSTGRES_HOST="$POSTGRES_CONTAINER" \
-e POSTGRES_PORT=5432 \
-e POSTGRES_USERNAME=postgres \
-e POSTGRES_PASSWORD="${POSTGRES_PASSWORD:-password}" \
-e POSTGRES_DATABASE=postgres \
-v "$(pwd)/results:/app/results" \
-v "$(pwd)/postgres_state:/app/postgres_state" \
$([ -f .mcp_env ] && echo "-v $(pwd)/.mcp_env:/app/.mcp_env:ro") \
"$DOCKER_IMAGE" \
python3 -m pipeline --mcp "$SERVICE" --k 1 "$@"
elif [ "$SERVICE" = "filesystem" ]; then
# For filesystem service, mount test_environments
docker run --rm \
--memory="$DOCKER_MEMORY_LIMIT" \
--cpus="$DOCKER_CPU_LIMIT" \
-v "$(pwd)/results:/app/results" \
-v "$(pwd)/test_environments:/app/test_environments" \
$([ -f .mcp_env ] && echo "-v $(pwd)/.mcp_env:/app/.mcp_env:ro") \
"$DOCKER_IMAGE" \
python3 -m pipeline --mcp "$SERVICE" --k 1 "$@"
elif [ "$SERVICE" = "insforge" ]; then
# For Insforge service, use host network to access Insforge backend on host
docker run --rm \
--memory="$DOCKER_MEMORY_LIMIT" \
--cpus="$DOCKER_CPU_LIMIT" \
--add-host=host.docker.internal:host-gateway \
-v "$(pwd)/results:/app/results" \
$([ -f .mcp_env ] && echo "-v $(pwd)/.mcp_env:/app/.mcp_env:ro") \
"$DOCKER_IMAGE" \
python3 -m pipeline --mcp "$SERVICE" --k 1 "$@"
else
# For other services (notion, github, playwright, etc.)
docker run --rm \
--memory="$DOCKER_MEMORY_LIMIT" \
--cpus="$DOCKER_CPU_LIMIT" \
-v "$(pwd)/results:/app/results" \
-v "$(pwd)/test_environments:/app/test_environments" \
$([ -f .mcp_env ] && echo "-v $(pwd)/.mcp_env:/app/.mcp_env:ro") \
$([ -f notion_state.json ] && echo "-v $(pwd)/notion_state.json:/app/notion_state.json") \
"$DOCKER_IMAGE" \
python3 -m pipeline --mcp "$SERVICE" --k 1 "$@"
fi
echo "Task completed!"
================================================
FILE: src/agents/__init__.py
================================================
"""
MCPMark Agent Module
====================
Provides agent implementations and registry for MCPMark.
"""
from .base_agent import BaseMCPAgent
from .mcpmark_agent import MCPMarkAgent
from .react_agent import ReActAgent
AGENT_REGISTRY = {
"mcpmark": MCPMarkAgent,
"react": ReActAgent,
}
__all__ = ["BaseMCPAgent", "MCPMarkAgent", "ReActAgent", "AGENT_REGISTRY"]
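# Registry lookup sketch (constructor values below are illustrative, not defaults):
#   agent_cls = AGENT_REGISTRY["mcpmark"]
#   agent = agent_cls(
#       litellm_input_model_name="gpt-4.1",
#       api_key="sk-...",
#       base_url="",
#       mcp_service="filesystem",
#   )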
================================================
FILE: src/agents/base_agent.py
================================================
"""Shared base agent functionality for MCPMark agents."""
from __future__ import annotations
import asyncio
import copy
import json
import uuid
from abc import ABC, abstractmethod
from typing import Any, Dict, List, Optional, Callable
from src.logger import get_logger
from .mcp import MCPStdioServer, MCPHttpServer
from .utils import TokenUsageTracker
logger = get_logger(__name__)
class BaseMCPAgent(ABC):
"""Base class with shared functionality for MCPMark agents."""
STDIO_SERVICES = [
"notion",
"filesystem",
"playwright",
"playwright_webarena",
"postgres",
"insforge",
"github",
]
HTTP_SERVICES = ["supabase"]
DEFAULT_TIMEOUT = 600
COMPACTION_DISABLED_TOKEN = 999_999_999
CLAUDE_THINKING_BUDGETS = {
"low": 1024,
"medium": 2048,
"high": 4096,
}
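# e.g. a Claude model run with reasoning_effort="medium" gets a 2048-token thinking budget;
# "default" keeps extended thinking disabled entirely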
def __init__(
self,
litellm_input_model_name: str,
api_key: str,
base_url: str,
mcp_service: str,
timeout: int = DEFAULT_TIMEOUT,
service_config: Optional[Dict[str, Any]] = None,
service_config_provider: Optional[Callable[[], Dict[str, Any]]] = None,
reasoning_effort: Optional[str] = "default",
compaction_token: int = COMPACTION_DISABLED_TOKEN,
):
self.litellm_input_model_name = litellm_input_model_name
self.api_key = api_key
self.base_url = base_url
self.mcp_service = mcp_service
self.timeout = timeout
self.service_config = service_config or {}
self._service_config_provider = service_config_provider
self.reasoning_effort = reasoning_effort or "default"
self.compaction_token = int(compaction_token)
self.is_claude = self._is_anthropic_model(litellm_input_model_name)
self.use_claude_thinking = self.is_claude and self.reasoning_effort != "default"
self.usage_tracker = TokenUsageTracker()
self.litellm_run_model_name = None
self._partial_messages: List[Dict[str, Any]] = []
self._partial_token_usage: Dict[str, int] = {}
self._partial_turn_count: int = 0
logger.debug(
"Initialized %s for service '%s' with model '%s'",
self.__class__.__name__,
self.mcp_service,
self.litellm_input_model_name,
)
# Warn if Gemini 3 model uses unsupported reasoning_effort value
if self._is_gemini_3_model() and self.reasoning_effort not in [
"default",
"low",
"high",
]:
logger.warning(
"Gemini 3 models only support reasoning_effort 'low' or 'high', "
"got '%s'. LiteLLM may map this to the nearest supported value.",
self.reasoning_effort,
)
def __repr__(self) -> str: # pragma: no cover - debug helper
return (
f"{self.__class__.__name__}(service='{self.mcp_service}', "
f"model='{self.litellm_input_model_name}')"
)
@abstractmethod
async def execute(
self,
instruction: str,
tool_call_log_file: Optional[str] = None,
) -> Dict[str, Any]:
"""Execute the agent logic and return execution metadata."""
def execute_sync(
self,
instruction: str,
tool_call_log_file: Optional[str] = None,
) -> Dict[str, Any]:
"""Synchronous wrapper for async execution."""
return asyncio.run(self.execute(instruction, tool_call_log_file))
def get_usage_stats(self) -> Dict[str, Any]:
"""Return aggregated usage statistics."""
return self.usage_tracker.get_stats()
def reset_usage_stats(self):
"""Clear usage statistics."""
self.usage_tracker.reset()
# ------------------------------------------------------------------
# Shared helpers
# ------------------------------------------------------------------
def _is_anthropic_model(self, model_name: str) -> bool:
return "claude" in model_name.lower()
def _get_claude_thinking_budget(self) -> Optional[int]:
if not self.use_claude_thinking:
return None
return self.CLAUDE_THINKING_BUDGETS.get(self.reasoning_effort, 2048)
def _refresh_service_config(self):
if not self._service_config_provider:
return
try:
latest_cfg = self._service_config_provider() or {}
self.service_config.update(latest_cfg)
except Exception as exc: # pragma: no cover - best effort refresh
logger.warning("Failed to refresh service config: %s", exc)
def _reset_progress(self):
self._partial_messages = []
self._partial_token_usage = {}
self._partial_turn_count = 0
def _update_progress(
self,
messages: List[Dict[str, Any]],
token_usage: Dict[str, Any],
turn_count: int,
):
try:
self._partial_messages = copy.deepcopy(messages)
self._partial_token_usage = dict(token_usage or {})
self._partial_turn_count = int(turn_count or 0)
except Exception: # pragma: no cover - defensive copy
pass
# ------------------------------------------------------------------
# MCP server management
# ------------------------------------------------------------------
async def _create_mcp_server(self) -> Any:
if self.mcp_service in self.STDIO_SERVICES:
return self._create_stdio_server()
if self.mcp_service in self.HTTP_SERVICES:
return self._create_http_server()
raise ValueError(f"Unsupported MCP service: {self.mcp_service}")
def _create_stdio_server(self) -> MCPStdioServer:
if self.mcp_service == "notion":
notion_key = self.service_config.get("notion_key")
if not notion_key:
raise ValueError("Notion API key required")
return MCPStdioServer(
command="npx",
args=["-y", "@notionhq/notion-mcp-server"],
env={
"OPENAPI_MCP_HEADERS": (
'{"Authorization": "Bearer ' + notion_key + '", '
'"Notion-Version": "2022-06-28"}'
)
},
)
if self.mcp_service == "filesystem":
test_directory = self.service_config.get("test_directory")
if not test_directory:
raise ValueError("Test directory required for filesystem service")
return MCPStdioServer(
command="npx",
args=[
"-y",
"@modelcontextprotocol/server-filesystem",
str(test_directory),
],
)
if self.mcp_service in ("playwright", "playwright_webarena"):
browser = self.service_config.get("browser", "chromium")
headless = self.service_config.get("headless", True)
viewport_width = self.service_config.get("viewport_width", 1280)
viewport_height = self.service_config.get("viewport_height", 720)
args = ["-y", "@playwright/mcp@latest"]
if headless:
args.append("--headless")
args.extend(
[
"--isolated",
"--no-sandbox",
"--browser",
browser,
"--viewport-size",
f"{viewport_width},{viewport_height}",
]
)
return MCPStdioServer(command="npx", args=args)
if self.mcp_service == "postgres":
host = self.service_config.get("host", "localhost")
port = self.service_config.get("port", 5432)
username = self.service_config.get("username")
password = self.service_config.get("password")
database = self.service_config.get(
"current_database"
) or self.service_config.get("database")
if not all([username, password, database]):
raise ValueError("PostgreSQL requires username, password, and database")
database_url = (
f"postgresql://{username}:{password}@{host}:{port}/{database}"
)
return MCPStdioServer(
command="pipx",
args=["run", "postgres-mcp", "--access-mode=unrestricted"],
env={"DATABASE_URI": database_url},
)
if self.mcp_service == "insforge":
api_key = self.service_config.get("api_key")
backend_url = self.service_config.get("backend_url")
if not all([api_key, backend_url]):
raise ValueError("Insforge requires api_key and backend_url")
return MCPStdioServer(
command="npx",
args=["-y", "@insforge/mcp@dev"],
env={
"INSFORGE_API_KEY": api_key,
"INSFORGE_BACKEND_URL": backend_url,
},
)
raise ValueError(f"Unsupported stdio service: {self.mcp_service}")
def _create_http_server(self) -> MCPHttpServer:
if self.mcp_service == "github":
github_token = self.service_config.get("github_token")
if not github_token:
raise ValueError("GitHub token required")
return MCPHttpServer(
url="https://api.githubcopilot.com/mcp/",
headers={
"Authorization": f"Bearer {github_token}",
"User-Agent": "MCPMark/1.0",
},
)
raise ValueError(f"Unsupported HTTP service: {self.mcp_service}")
# ------------------------------------------------------------------
# Message/Tool formatting helpers
# ------------------------------------------------------------------
def _compaction_enabled(self) -> bool:
return 0 < self.compaction_token < self.COMPACTION_DISABLED_TOKEN
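# e.g. compaction_token=200_000 (illustrative) enables compaction once the running prompt
# reaches ~200k tokens; the 999_999_999 sentinel (the default) keeps it disabled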
def _count_prompt_tokens_litellm(self, messages: List[Dict[str, Any]]) -> int:
try:
from litellm import token_counter
return int(
token_counter(model=self.litellm_input_model_name, messages=messages)
or 0
)
except Exception: # pragma: no cover - best effort
return 0
def _convert_to_sdk_format(
self, messages: List[Dict[str, Any]]
) -> List[Dict[str, Any]]:
"""Convert chat-completion style messages into the flat SDK item format
(message / function_call / function_call_output) used when logging results."""
sdk_format: List[Dict[str, Any]] = []
function_call_map: Dict[str, str] = {}
for msg in messages:
role = msg.get("role")
if role == "user":
user_content = msg.get("content", "")
if isinstance(user_content, list):
tool_results = [
item
for item in user_content
if isinstance(item, dict) and item.get("type") == "tool_result"
]
if tool_results:
for tr in tool_results:
content_items = tr.get("content", [])
text_content = ""
for ci in content_items:
if isinstance(ci, dict) and ci.get("type") == "text":
text_content = ci.get("text", "")
break
sdk_format.append(
{
"call_id": tr.get("tool_use_id", ""),
"output": json.dumps(
{
"type": "text",
"text": text_content,
"annotations": None,
"meta": None,
}
),
"type": "function_call_output",
}
)
else:
text_parts = []
for item in user_content:
if isinstance(item, dict) and item.get("type") == "text":
text_parts.append(item.get("text", ""))
sdk_format.append(
{"content": "\n".join(text_parts), "role": "user"}
)
else:
sdk_format.append({"content": user_content, "role": "user"})
elif role == "assistant":
tool_calls = msg.get("tool_calls", [])
function_call = msg.get("function_call")
content = msg.get("content")
if isinstance(content, list):
text_parts = []
claude_tool_uses = []
for block in content:
if isinstance(block, dict):
if block.get("type") == "text":
text_parts.append(block.get("text", ""))
elif block.get("type") == "thinking":
thinking_text = block.get("thinking", "")
if thinking_text:
text_parts.append(
f"\n{thinking_text}\n"
)
elif block.get("type") == "tool_use":
claude_tool_uses.append(block)
content = "\n".join(text_parts)
if claude_tool_uses and not tool_calls:
tool_calls = []
for tu in claude_tool_uses:
tool_calls.append(
{
"id": tu.get("id"),
"function": {
"name": tu.get("name"),
"arguments": json.dumps(tu.get("input", {})),
},
}
)
if content:
sdk_format.append(
{
"id": "__fake_id__",
"content": [
{
"annotations": [],
"text": content,
"type": "output_text",
}
],
"role": "assistant",
"status": "completed",
"type": "message",
}
)
if tool_calls:
for tool_call in tool_calls:
call_id = tool_call.get("id", f"call_{uuid.uuid4().hex}")
func_name = tool_call.get("function", {}).get("name", "")
sdk_format.append(
{
"arguments": tool_call.get("function", {}).get(
"arguments", "{}"
),
"call_id": call_id,
"name": func_name,
"type": "function_call",
"id": "__fake_id__",
}
)
if function_call:
func_name = function_call.get("name", "")
call_id = f"call_{uuid.uuid4().hex}"
function_call_map[func_name] = call_id
sdk_format.append(
{
"arguments": function_call.get("arguments", "{}"),
"call_id": call_id,
"name": func_name,
"type": "function_call",
"id": "__fake_id__",
}
)
elif role == "tool":
sdk_format.append(
{
"call_id": msg.get("tool_call_id", ""),
"output": json.dumps(
{
"type": "text",
"text": msg.get("content", ""),
"annotations": None,
"meta": None,
}
),
"type": "function_call_output",
}
)
elif role == "function":
func_name = msg.get("name", "")
call_id = function_call_map.get(func_name, f"call_{uuid.uuid4().hex}")
sdk_format.append(
{
"call_id": call_id,
"output": json.dumps(
{
"type": "text",
"text": msg.get("content", ""),
"annotations": None,
"meta": None,
}
),
"type": "function_call_output",
}
)
return sdk_format
def _convert_to_anthropic_format(
self, tools: List[Dict[str, Any]]
) -> List[Dict[str, Any]]:
"""Convert MCP tool definitions into Anthropic tool-use format."""
anthropic_tools = []
for tool in tools:
anthropic_tool = {
"name": tool.get("name"),
"description": tool.get("description", ""),
"input_schema": tool.get(
"inputSchema",
{"type": "object", "properties": {}, "required": []},
),
}
anthropic_tools.append(anthropic_tool)
return anthropic_tools
def _is_gemini_model(self) -> bool:
model_lower = self.litellm_input_model_name.lower()
return "gemini" in model_lower or "bison" in model_lower
def _is_gemini_3_model(self) -> bool:
"""Check if this is a Gemini 3 series model."""
model_lower = self.litellm_input_model_name.lower()
return "gemini-3" in model_lower or "gemini/gemini-3" in model_lower
def _simplify_schema_for_gemini(
self, schema: Optional[Dict[str, Any]]
) -> Dict[str, Any]:
if not isinstance(schema, dict):
return schema or {}
simplified: Dict[str, Any] = {}
for key, value in schema.items():
if key == "type" and isinstance(value, list):
simplified[key] = value[0] if value else "string"
elif key == "items" and isinstance(value, dict):
simplified[key] = self._simplify_schema_for_gemini(value)
elif key == "properties" and isinstance(value, dict):
simplified[key] = {
prop_key: self._simplify_schema_for_gemini(prop_val)
for prop_key, prop_val in value.items()
}
elif isinstance(value, dict):
simplified[key] = self._simplify_schema_for_gemini(value)
elif isinstance(value, list) and key not in ("required", "enum"):
simplified[key] = [
self._simplify_schema_for_gemini(item)
if isinstance(item, dict)
else item
for item in value
]
else:
simplified[key] = value
return simplified
def _convert_to_openai_format(
self, tools: List[Dict[str, Any]]
) -> List[Dict[str, Any]]:
"""Convert MCP tool definitions into OpenAI function-calling format,
simplifying schemas for Gemini models."""
functions = []
is_gemini = self._is_gemini_model()
if is_gemini:
logger.debug(
"Detected Gemini model '%s' – simplifying tool schemas",
self.litellm_input_model_name,
)
for tool in tools:
input_schema = tool.get(
"inputSchema", {"type": "object", "properties": {}, "required": []}
)
if is_gemini:
simplified = self._simplify_schema_for_gemini(input_schema)
if simplified != input_schema:
input_schema = simplified
logger.debug("Simplified schema for tool '%s'", tool.get("name"))
functions.append(
{
"name": tool.get("name"),
"description": tool.get("description", ""),
"parameters": input_schema,
}
)
if is_gemini:
logger.info("Converted %d tools for Gemini compatibility", len(functions))
return functions
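# Conversion sketch for a hypothetical MCP tool definition:
#   {"name": "read_file", "description": "Read a file", "inputSchema": {"type": "object", ...}}
# becomes an OpenAI-style function:
#   {"name": "read_file", "description": "Read a file", "parameters": {"type": "object", ...}}
# while _convert_to_anthropic_format carries the same schema under "input_schema" instead.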
================================================
FILE: src/agents/mcp/__init__.py
================================================
"""
MCP (Model Context Protocol) Components
========================================
Minimal MCP server implementations for MCPMark.
"""
from .stdio_server import MCPStdioServer
from .http_server import MCPHttpServer
__all__ = ["MCPStdioServer", "MCPHttpServer"]
================================================
FILE: src/agents/mcp/http_server.py
================================================
"""
Minimal MCP HTTP Server Implementation
=======================================
Provides HTTP-based MCP server communication for services like GitHub.
"""
import asyncio
from contextlib import AsyncExitStack
from typing import Any, Dict, List, Optional
from mcp import ClientSession
from mcp.client.streamable_http import streamablehttp_client
class MCPHttpServer:
"""
HTTP-based MCP client using the official MCP Python SDK
(Streamable HTTP transport).
"""
def __init__(
self,
url: str,
headers: Optional[Dict[str, str]] = None,
timeout: int = 30,
):
self.url = url.rstrip("/")
self.headers = headers or {}
self.timeout = timeout
self._stack: Optional[AsyncExitStack] = None
self.session: Optional[ClientSession] = None
self._tools_cache: Optional[List[Dict[str, Any]]] = None
async def __aenter__(self):
await self.start()
return self
async def __aexit__(self, exc_type, exc, tb):
await self.stop()
async def start(self):
"""Open Streamable HTTP transport and initialize MCP session."""
self._stack = AsyncExitStack()
read_stream, write_stream, _ = await self._stack.enter_async_context(
streamablehttp_client(self.url, headers=self.headers)
)
self.session = await self._stack.enter_async_context(ClientSession(read_stream, write_stream))
await asyncio.wait_for(self.session.initialize(), timeout=self.timeout)
async def stop(self):
"""Close the session/transport cleanly."""
if self._stack:
await self._stack.aclose()
self._stack = None
self.session = None
self._tools_cache = None
async def list_tools(self) -> List[Dict[str, Any]]:
"""Return tool definitions (cached)."""
if self._tools_cache is not None:
return self._tools_cache
if not self.session:
raise RuntimeError("MCP HTTP client not started")
resp = await asyncio.wait_for(self.session.list_tools(), timeout=self.timeout)
self._tools_cache = [t.model_dump() for t in resp.tools]
return self._tools_cache
async def call_tool(self, name: str, arguments: Dict[str, Any]) -> Any:
"""Invoke a remote tool and return the structured result."""
if not self.session:
raise RuntimeError("MCP HTTP client not started")
result = await asyncio.wait_for(self.session.call_tool(name, arguments), timeout=self.timeout)
return result.model_dump()
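# Usage sketch (URL, header, and tool name are illustrative):
#   async with MCPHttpServer("https://example.com/mcp", headers={"Authorization": "Bearer ..."}) as server:
#       tools = await server.list_tools()
#       result = await server.call_tool("search_issues", {"query": "is:open"})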
================================================
FILE: src/agents/mcp/stdio_server.py
================================================
"""
Minimal MCP Stdio Server Implementation
========================================
Provides stdio-based MCP server communication for services like
Notion, Filesystem, Playwright, and Postgres.
"""
import asyncio
import os
from contextlib import AsyncExitStack
from typing import Any, Dict, List, Optional
from mcp import ClientSession, StdioServerParameters
from mcp.client.stdio import stdio_client
class MCPStdioServer:
"""Lightweight wrapper around the official MCP Python SDK."""
def __init__(self, command: str, args: List[str], env: Optional[Dict[str, str]] = None, timeout: int = 120):
self.params = StdioServerParameters(command=command, args=args, env={**os.environ, **(env or {})})
self.timeout = timeout
self._stack: Optional[AsyncExitStack] = None
self._streams = None
self.session: Optional[ClientSession] = None
async def __aenter__(self):
self._stack = AsyncExitStack()
read, write = await self._stack.enter_async_context(stdio_client(self.params))
self.session = await self._stack.enter_async_context(ClientSession(read, write))
await asyncio.wait_for(self.session.initialize(), timeout=self.timeout)
return self
async def __aexit__(self, exc_type, exc, tb):
if self._stack:
await self._stack.aclose()
self._stack = None
self.session = None
async def list_tools(self) -> List[Dict[str, Any]]:
resp = await asyncio.wait_for(self.session.list_tools(), timeout=self.timeout)
return [t.model_dump() for t in resp.tools]
async def call_tool(self, name: str, arguments: Dict[str, Any]) -> Any:
result = await asyncio.wait_for(self.session.call_tool(name, arguments), timeout=self.timeout)
return result.model_dump()  # Same as above: convert the result to a plain dict
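# Usage sketch (directory and tool name are illustrative):
#   async with MCPStdioServer("npx", ["-y", "@modelcontextprotocol/server-filesystem", "/tmp/work"]) as server:
#       tools = await server.list_tools()
#       result = await server.call_tool("list_directory", {"path": "/tmp/work"})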
================================================
FILE: src/agents/mcpmark_agent.py
================================================
"""
MCPMark Agent Implementation
============================
Unified agent using LiteLLM for all model interactions with minimal MCP support.
"""
import asyncio
import json
import time
from typing import Any, Dict, List, Optional, Callable
from pydantic import AnyUrl
import httpx
import litellm
import nest_asyncio
from src.logger import get_logger
from .base_agent import BaseMCPAgent
from .mcp import MCPStdioServer, MCPHttpServer
# Apply nested asyncio support
nest_asyncio.apply()
# Configure LiteLLM
litellm.suppress_debug_info = True
logger = get_logger(__name__)
# To fix the "Object of type AnyUrl is not JSON serializable" error in the find_file_contents function.
class CustomJSONEncoder(json.JSONEncoder):
def default(self, obj):
if isinstance(obj, AnyUrl):
return str(obj)
return super().default(obj)
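# e.g. json.dumps({"uri": AnyUrl("https://example.com/")}, cls=CustomJSONEncoder)
# serializes the AnyUrl as a plain string instead of raising TypeError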
class MCPMarkAgent(BaseMCPAgent):
"""
Unified agent for LLM and MCP server management using LiteLLM.
- Anthropic models: Native MCP support via extra_body
- Other models: Manual MCP server management with function calling
"""
MAX_TURNS = 100
SYSTEM_PROMPT = (
"You are a helpful agent that uses tools iteratively to complete the user's task, "
'and when finished, provides the final answer or simply states "Task completed" without further tool calls.'
)
COMPACTION_PROMPT = (
"You are performing a CONTEXT CHECKPOINT COMPACTION.\n"
"Summarize the conversation so far for another model to continue.\n\n"
"Include:\n"
"- Current progress and key decisions made\n"
"- Important context, constraints, or user preferences\n"
"- What remains to be done (clear next steps)\n"
"- Any critical data, examples, or references needed to continue\n\n"
"Be concise and structured. Do NOT call tools."
)
DEFAULT_TIMEOUT = BaseMCPAgent.DEFAULT_TIMEOUT
def __init__(
self,
litellm_input_model_name: str,
api_key: str,
base_url: str,
mcp_service: str,
timeout: int = DEFAULT_TIMEOUT,
service_config: Optional[Dict[str, Any]] = None,
service_config_provider: Optional[Callable[[], Dict[str, Any]]] = None,
reasoning_effort: Optional[str] = "default",
compaction_token: int = BaseMCPAgent.COMPACTION_DISABLED_TOKEN,
):
super().__init__(
litellm_input_model_name=litellm_input_model_name,
api_key=api_key,
base_url=base_url,
mcp_service=mcp_service,
timeout=timeout,
service_config=service_config,
service_config_provider=service_config_provider,
reasoning_effort=reasoning_effort,
compaction_token=compaction_token,
)
logger.debug(
"Initialized MCPMarkAgent for '%s' with model '%s' (Claude: %s, Thinking: %s, Reasoning: %s)",
mcp_service,
litellm_input_model_name,
self.is_claude,
self.use_claude_thinking,
reasoning_effort,
)
# ==================== Public Interface Methods ====================
async def execute(
self, instruction: str, tool_call_log_file: Optional[str] = None
) -> Dict[str, Any]:
"""
Execute instruction with the agent.
Args:
instruction: The instruction/prompt to execute
tool_call_log_file: Optional path to log tool calls
Returns:
Dictionary containing execution results
"""
start_time = time.time()
try:
# Reset partial progress for this run
self._reset_progress()
# Refresh service configuration
self._refresh_service_config()
# Execute with timeout control
async def _execute_with_strategy():
if self.use_claude_thinking:
# Claude with thinking -> native Anthropic API with tools
return await self._execute_claude_native_with_tools(
instruction, tool_call_log_file
)
else:
# All other cases -> LiteLLM with tools
return await self._execute_litellm_with_tools(
instruction, tool_call_log_file
)
# Apply timeout to the entire execution
result = await asyncio.wait_for(
_execute_with_strategy(), timeout=self.timeout
)
execution_time = time.time() - start_time
# Update usage statistics
self.usage_tracker.update(
success=result["success"],
token_usage=result.get("token_usage", {}),
turn_count=result.get("turn_count", 0),
execution_time=execution_time,
)
result["execution_time"] = execution_time
return result
except Exception as e:
execution_time = time.time() - start_time
if isinstance(e, asyncio.TimeoutError):
error_msg = f"Execution timed out after {self.timeout} seconds"
logger.error(error_msg)
else:
error_msg = f"Agent execution failed: {e}"
logger.error(error_msg, exc_info=True)
self.usage_tracker.update(
success=False,
token_usage=self._partial_token_usage or {},
turn_count=self._partial_turn_count or 0,
execution_time=execution_time,
)
if self._partial_messages:
if not self.is_claude:
final_msg = self._convert_to_sdk_format(self._partial_messages)
else:
final_msg = self._partial_messages
else:
final_msg = []
return {
"success": False,
"output": final_msg,
"token_usage": self._partial_token_usage or {},
"turn_count": self._partial_turn_count or 0,
"execution_time": execution_time,
"error": error_msg,
"litellm_run_model_name": self.litellm_run_model_name,
}
def execute_sync(
self, instruction: str, tool_call_log_file: Optional[str] = None
) -> Dict[str, Any]:
"""
Synchronous wrapper for execute method.
"""
return asyncio.run(self.execute(instruction, tool_call_log_file))
def get_usage_stats(self) -> Dict[str, Any]:
"""Get usage statistics."""
return self.usage_tracker.get_stats()
def reset_usage_stats(self):
"""Reset usage statistics."""
self.usage_tracker.reset()
# ==================== Claude Native API Execution Path ====================
async def _execute_claude_native_with_tools(
self, instruction: str, tool_call_log_file: Optional[str] = None
) -> Dict[str, Any]:
"""
Execute Claude with thinking using native Anthropic API.
Creates MCP server, gets tools, and executes with thinking.
"""
logger.debug("Using Claude native API with thinking")
thinking_budget = self._get_claude_thinking_budget()
# Create and start MCP server
mcp_server = await self._create_mcp_server()
async with mcp_server:
# Get available tools
tools = await mcp_server.list_tools()
# Convert MCP tools to Anthropic format
anthropic_tools = self._convert_to_anthropic_format(tools)
# Execute with function calling loop
return await self._execute_anthropic_native_tool_loop(
instruction,
anthropic_tools,
mcp_server,
thinking_budget,
tool_call_log_file,
)
async def _call_claude_native_api(
self,
messages: List[Dict],
thinking_budget: int,
tools: Optional[List[Dict]] = None,
mcp_servers: Optional[List[Dict]] = None,
system: Optional[str] = None,
) -> tuple[Optional[Dict[str, Any]], Optional[Any]]:
"""
Call Claude's native API directly using httpx.
Args:
messages: Conversation messages
thinking_budget: Token budget for thinking
tools: Tool definitions for function calling
mcp_servers: MCP server configurations
system: System prompt
Returns:
Tuple of (response JSON dict, error); the error slot is None on success.
"""
# Get API base and headers
import os
api_base = os.getenv("ANTHROPIC_API_BASE", "https://api.anthropic.com")
headers = {
"x-api-key": self.api_key,
"anthropic-version": "2023-06-01",
"content-type": "application/json",
"anthropic-beta": "context-1m-2025-08-07", # by default
}
# Build payload
max_tokens = max(thinking_budget + 4096, 4096)
payload = {
"model": self.litellm_input_model_name.replace("anthropic/", ""),
"max_tokens": max_tokens,
"messages": messages,
}
# Add thinking configuration
if thinking_budget:
payload["thinking"] = {"type": "enabled", "budget_tokens": thinking_budget}
# Add tools if provided
if tools:
payload["tools"] = tools
payload["tool_choice"] = {"type": "auto"}
# Add MCP servers if provided
if mcp_servers:
headers["anthropic-beta"] = "mcp-client-2025-04-04"
payload["mcp_servers"] = mcp_servers
# Add system prompt if provided
if system:
payload["system"] = system
# Make the API call
async with httpx.AsyncClient() as client:
try:
response = await client.post(
f"{api_base}/v1/messages",
headers=headers,
json=payload,
timeout=self.timeout,
)
response.raise_for_status()
return response.json(), None
except httpx.HTTPStatusError as e:
return None, e.response.text
except Exception as e:
return None, e
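# Request-body sketch built above (values illustrative; thinking/tools/system keys
# only appear when provided):
#   {"model": "claude-sonnet-4-...", "max_tokens": 8192, "messages": [...],
#    "thinking": {"type": "enabled", "budget_tokens": 4096},
#    "tools": [...], "tool_choice": {"type": "auto"}, "system": "..."}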
async def _count_claude_input_tokens(
self,
messages: List[Dict[str, Any]],
tools: Optional[List[Dict]] = None,
system: Optional[str] = None,
) -> int:
import os
api_base = os.getenv("ANTHROPIC_API_BASE", "https://api.anthropic.com")
headers = {
"x-api-key": self.api_key,
"anthropic-version": "2023-06-01",
"content-type": "application/json",
}
payload: Dict[str, Any] = {
"model": self.litellm_input_model_name.replace("anthropic/", ""),
"messages": messages,
}
if tools:
payload["tools"] = tools
if system:
payload["system"] = system
async with httpx.AsyncClient() as client:
response = await client.post(
f"{api_base}/v1/messages/count_tokens",
headers=headers,
json=payload,
timeout=self.timeout,
)
response.raise_for_status()
data = response.json() or {}
return int(data.get("input_tokens", 0) or 0)
def _extract_litellm_text(self, response: Any) -> str:
try:
choices = getattr(response, "choices", None) or []
if not choices:
return ""
msg = getattr(choices[0], "message", None)
if msg is not None:
return str(getattr(msg, "content", "") or "")
return str(getattr(choices[0], "text", "") or "")
except Exception: # pragma: no cover - best effort
return ""
def _extract_anthropic_text(self, response_json: Dict[str, Any]) -> str:
pieces: List[str] = []
for block in response_json.get("content", []) or []:
if isinstance(block, dict) and block.get("type") == "text":
text = block.get("text")
if text:
pieces.append(str(text))
return "\n".join(pieces).strip()
def _merge_usage(self, total_tokens: Dict[str, int], usage: Dict[str, Any]) -> None:
try:
input_tokens = int(usage.get("input_tokens", 0) or 0)
output_tokens = int(usage.get("output_tokens", 0) or 0)
total_tokens_count = int(
usage.get("total_tokens", 0) or (input_tokens + output_tokens)
)
total_tokens["input_tokens"] += input_tokens
total_tokens["output_tokens"] += output_tokens
total_tokens["total_tokens"] += total_tokens_count
except Exception: # pragma: no cover - best effort
return
async def _maybe_compact_litellm_messages(
self,
messages: List[Dict[str, Any]],
total_tokens: Dict[str, int],
tool_call_log_file: Optional[str],
current_prompt_tokens: int,
) -> List[Dict[str, Any]]:
if not self._compaction_enabled():
return messages
if current_prompt_tokens < self.compaction_token:
return messages
logger.info(
f"| [compaction] Triggered at prompt tokens: {current_prompt_tokens:,}"
)
if tool_call_log_file:
try:
with open(tool_call_log_file, "a", encoding="utf-8") as f:
f.write(
f"| [compaction] Triggered at prompt tokens: {current_prompt_tokens:,}\n"
)
except Exception:
pass
compact_messages = [
{"role": "system", "content": self.COMPACTION_PROMPT},
{"role": "user", "content": json.dumps(messages, ensure_ascii=False)},
]
completion_kwargs = {
"model": self.litellm_input_model_name,
"messages": compact_messages,
"api_key": self.api_key,
}
if self.base_url:
completion_kwargs["base_url"] = self.base_url
response = await litellm.acompletion(**completion_kwargs)
usage = getattr(response, "usage", None)
if usage:
input_tokens = (
getattr(usage, "prompt_tokens", None)
or getattr(usage, "input_tokens", None)
or 0
)
output_tokens = (
getattr(usage, "completion_tokens", None)
or getattr(usage, "output_tokens", None)
or 0
)
total_tokens_count = getattr(usage, "total_tokens", None)
if total_tokens_count is None:
total_tokens_count = input_tokens + output_tokens
total_tokens["input_tokens"] += int(input_tokens or 0)
total_tokens["output_tokens"] += int(output_tokens or 0)
total_tokens["total_tokens"] += int(total_tokens_count or 0)
summary = self._extract_litellm_text(response).strip() or "(no summary)"
system_msg = (
messages[0]
if messages
else {"role": "system", "content": self.SYSTEM_PROMPT}
)
first_user = (
messages[1] if len(messages) > 1 else {"role": "user", "content": ""}
)
return [
system_msg,
first_user,
{
"role": "user",
"content": f"Context summary (auto-compacted due to token limit):\n{summary}",
},
]
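# After compaction the running conversation collapses to three messages: the original
# system prompt, the original user instruction, and one user message carrying the summary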
async def _maybe_compact_anthropic_messages(
self,
messages: List[Dict[str, Any]],
total_tokens: Dict[str, int],
thinking_budget: int,
tool_call_log_file: Optional[str],
current_input_tokens: int,
) -> List[Dict[str, Any]]:
if not self._compaction_enabled():
return messages
if current_input_tokens < self.compaction_token:
return messages
logger.info(
f"| [compaction] Triggered at input tokens: {current_input_tokens:,}"
)
if tool_call_log_file:
try:
with open(tool_call_log_file, "a", encoding="utf-8") as f:
f.write(
f"| [compaction] Triggered at input tokens: {current_input_tokens:,}\n"
)
except Exception:
pass
compact_messages = [
{"role": "user", "content": self.COMPACTION_PROMPT},
{"role": "user", "content": json.dumps(messages, ensure_ascii=False)},
]
response, error_msg = await self._call_claude_native_api(
messages=compact_messages,
thinking_budget=thinking_budget,
tools=None,
system=None,
)
if error_msg or not response:
logger.warning(f"| [compaction] Failed: {error_msg}")
return messages
usage = response.get("usage", {}) or {}
input_tokens = usage.get("input_tokens", 0) or 0
output_tokens = usage.get("output_tokens", 0) or 0
total_tokens["input_tokens"] += int(input_tokens)
total_tokens["output_tokens"] += int(output_tokens)
total_tokens["total_tokens"] += int(input_tokens + output_tokens)
summary = self._extract_anthropic_text(response) or "(no summary)"
first_user = messages[0] if messages else {"role": "user", "content": ""}
return [
first_user,
{
"role": "user",
"content": f"Context summary (auto-compacted due to token limit):\n{summary}",
},
]
async def _execute_anthropic_native_tool_loop(
self,
instruction: str,
tools: List[Dict],
mcp_server: Any,
thinking_budget: int,
tool_call_log_file: Optional[str] = None,
) -> Dict[str, Any]:
"""
Execute Claude thinking loop with function calling.
Handles thinking blocks, tool calls, and message formatting.
"""
messages = [{"role": "user", "content": instruction}]
total_tokens = {
"input_tokens": 0,
"output_tokens": 0,
"total_tokens": 0,
"reasoning_tokens": 0,
}
turn_count = 0
max_turns = self.MAX_TURNS
hit_turn_limit = False
ended_normally = False
system_text = self.SYSTEM_PROMPT
# Record initial state
self._update_progress(messages, total_tokens, turn_count)
for _ in range(max_turns):
turn_count += 1
current_input_tokens = 0
if self._compaction_enabled():
try:
current_input_tokens = await self._count_claude_input_tokens(
messages=messages,
tools=tools,
system=system_text,
)
except Exception as exc: # noqa: BLE001
logger.debug("Claude token counting failed: %s", exc)
messages = await self._maybe_compact_anthropic_messages(
messages=messages,
total_tokens=total_tokens,
thinking_budget=thinking_budget,
tool_call_log_file=tool_call_log_file,
current_input_tokens=current_input_tokens,
)
self._update_progress(messages, total_tokens, turn_count)
# Call Claude native API
response, error_msg = await self._call_claude_native_api(
messages=messages,
thinking_budget=thinking_budget,
tools=tools,
system=system_text,
)
if error_msg:
break
if turn_count == 1:
# Only read the response once we know the call succeeded
self.litellm_run_model_name = response["model"].split("/")[-1]
# Update token usage
if "usage" in response:
usage = response["usage"]
input_tokens = usage.get("input_tokens", 0)
output_tokens = usage.get("output_tokens", 0)
# Anthropic usage has no total field, so total = input + output
total_tokens_count = output_tokens + input_tokens
total_tokens["input_tokens"] += input_tokens
total_tokens["output_tokens"] += output_tokens
total_tokens["total_tokens"] += total_tokens_count
## TODO: add reasoning tokens for claude
# Extract blocks from response
blocks = response.get("content", [])
tool_uses = [b for b in blocks if b.get("type") == "tool_use"]
thinking_blocks = [b for b in blocks if b.get("type") == "thinking"]
text_blocks = [b for b in blocks if b.get("type") == "text"]
# Log text output
for tb in text_blocks:
if tb.get("text") and tool_call_log_file:
with open(tool_call_log_file, "a", encoding="utf-8") as f:
f.write(f"{tb['text']}\n")
if tb.get("text"):
for line in tb["text"].splitlines():
logger.info(f"| {line}")
# Build assistant message with all blocks
assistant_content = []
# Add thinking blocks
for tb in thinking_blocks:
assistant_content.append(
{
"type": "thinking",
"thinking": tb.get("thinking", ""),
"signature": tb.get("signature", ""),
}
)
# Add text blocks
for tb in text_blocks:
if tb.get("text"):
assistant_content.append({"type": "text", "text": tb["text"]})
# Add tool_use blocks
for tu in tool_uses:
assistant_content.append(
{
"type": "tool_use",
"id": tu.get("id"),
"name": tu.get("name"),
"input": tu.get("input", {}),
}
)
messages.append({"role": "assistant", "content": assistant_content})
# Update partial progress after assistant response
self._update_progress(messages, total_tokens, turn_count)
# If no tool calls, we're done
if not tool_uses:
ended_normally = True
break
# Execute tools and add results
tool_results = []
for tu in tool_uses:
name = tu.get("name")
inputs = tu.get("input", {})
# Log tool call
args_str = json.dumps(inputs, separators=(",", ": "))
display_args = (
args_str[:140] + "..." if len(args_str) > 140 else args_str
)
logger.info(f"| \033[1m{name}\033[0m \033[2;37m{display_args}\033[0m")
if tool_call_log_file:
with open(tool_call_log_file, "a", encoding="utf-8") as f:
f.write(f"| {name} {args_str}\n")
# Execute tool
try:
result = await asyncio.wait_for(
mcp_server.call_tool(name, inputs), timeout=60
)
tool_results.append(
{
"type": "tool_result",
"tool_use_id": tu["id"],
"content": [
{
"type": "text",
"text": json.dumps(result, cls=CustomJSONEncoder),
}
],
}
)
except Exception as e:
logger.error(f"Tool call failed: {e}")
tool_results.append(
{
"type": "tool_result",
"tool_use_id": tu["id"],
"content": [{"type": "text", "text": f"Error: {str(e)}"}],
}
)
messages.append({"role": "user", "content": tool_results})
# Update partial progress after tool results
self._update_progress(messages, total_tokens, turn_count)
# Detect if we exited due to hitting the turn limit
if not ended_normally:
if turn_count >= max_turns:
hit_turn_limit = True
logger.warning(
f"| Max turns ({max_turns}) exceeded; returning failure with partial output."
)
if tool_call_log_file:
try:
with open(tool_call_log_file, "a", encoding="utf-8") as f:
f.write(f"| Max turns ({max_turns}) exceeded\n")
except Exception:
pass
elif error_msg:
logger.warning(f"| {error_msg}\n")
if tool_call_log_file:
try:
with open(tool_call_log_file, "a", encoding="utf-8") as f:
f.write(f"| {error_msg}\n")
except Exception:
pass
# Display final token usage
if total_tokens["total_tokens"] > 0:
log_msg = (
f"|\n| Token usage: Total: {total_tokens['total_tokens']:,} | "
f"Input: {total_tokens['input_tokens']:,} | "
f"Output: {total_tokens['output_tokens']:,}"
)
if total_tokens.get("reasoning_tokens", 0) > 0:
log_msg += f" | Reasoning: {total_tokens['reasoning_tokens']:,}"
logger.info(log_msg)
logger.info(f"| Turns: {turn_count}")
# Convert messages to SDK format
sdk_format_messages = self._convert_to_sdk_format(messages)
if hit_turn_limit:
return {
"success": False,
"output": sdk_format_messages,
"token_usage": total_tokens,
"turn_count": turn_count,
"error": f"Max turns ({max_turns}) exceeded",
"litellm_run_model_name": self.litellm_run_model_name,
}
if error_msg:
return {
"success": False,
"output": sdk_format_messages,
"token_usage": total_tokens,
"turn_count": turn_count,
"error": error_msg,
"litellm_run_model_name": self.litellm_run_model_name,
}
return {
"success": True,
"output": sdk_format_messages,
"token_usage": total_tokens,
"turn_count": turn_count,
"error": None,
"litellm_run_model_name": self.litellm_run_model_name,
}
# ==================== LiteLLM Execution Path ====================
async def _execute_litellm_with_tools(
self, instruction: str, tool_call_log_file: Optional[str] = None
) -> Dict[str, Any]:
"""
Execute with manual MCP server management.
Used for all non-Anthropic models and Anthropic models with STDIO services.
"""
logger.debug("Using manual MCP execution with function calling loop")
# Create and start MCP server
mcp_server = await self._create_mcp_server()
try:
async with mcp_server:
# Get available tools
tools = await mcp_server.list_tools()
# Convert MCP tools to OpenAI function format
functions = self._convert_to_openai_format(tools)
# Execute with function calling loop
return await self._execute_litellm_tool_loop(
instruction, functions, mcp_server, tool_call_log_file
)
except Exception as e:
logger.error(f"Manual MCP execution failed: {e}")
raise
async def _execute_litellm_tool_loop(
self,
instruction: str,
functions: List[Dict],
mcp_server: Any,
tool_call_log_file: Optional[str] = None,
) -> Dict[str, Any]:
"""Execute function calling loop with LiteLLM."""
messages = [
{"role": "system", "content": self.SYSTEM_PROMPT},
{"role": "user", "content": instruction},
]
total_tokens = {
"input_tokens": 0,
"output_tokens": 0,
"total_tokens": 0,
"reasoning_tokens": 0,
}
turn_count = 0
max_turns = self.MAX_TURNS # Limit turns to prevent infinite loops
consecutive_failures = 0
max_consecutive_failures = 3
hit_turn_limit = False
ended_normally = False
# Convert functions to tools format for newer models
tools = (
[{"type": "function", "function": func} for func in functions]
if functions
else None
)
if tool_call_log_file and tools:
max_name_length = (
max(len(tool.get("function", {}).get("name", "")) for tool in tools)
if tools
else 15
)
with open(tool_call_log_file, "a", encoding="utf-8") as f:
f.write("===== Available Tools =====\n")
for tool in tools:
function_info = tool.get("function", {})
tool_name = function_info.get("name", "N/A")
description = function_info.get("description", "N/A")
f.write(
f"- ToolName: {tool_name:<{max_name_length}} Description: {description}\n"
)
f.write("\n===== Execution Logs =====\n")
# Record initial state
self._update_progress(messages, total_tokens, turn_count)
try:
while turn_count < max_turns:
current_prompt_tokens = 0
if self._compaction_enabled():
current_prompt_tokens = self._count_prompt_tokens_litellm(messages)
messages = await self._maybe_compact_litellm_messages(
messages=messages,
total_tokens=total_tokens,
tool_call_log_file=tool_call_log_file,
current_prompt_tokens=current_prompt_tokens,
)
self._update_progress(messages, total_tokens, turn_count)
# Build completion kwargs
completion_kwargs = {
"model": self.litellm_input_model_name,
"messages": messages,
"api_key": self.api_key,
}
# Always use tools format if available - LiteLLM will handle conversion
if tools:
completion_kwargs["tools"] = tools
completion_kwargs["tool_choice"] = "auto"
# Add reasoning_effort and base_url if specified
if self.reasoning_effort != "default":
completion_kwargs["reasoning_effort"] = self.reasoning_effort
if self.base_url:
completion_kwargs["base_url"] = self.base_url
try:
# Call LiteLLM with timeout for individual call
response = await asyncio.wait_for(
litellm.acompletion(**completion_kwargs),
timeout=self.timeout / 2, # Use half of total timeout
)
consecutive_failures = 0 # Reset failure counter on success
except asyncio.TimeoutError:
logger.warning(f"| ✗ LLM call timed out on turn {turn_count + 1}")
consecutive_failures += 1
if consecutive_failures >= max_consecutive_failures:
raise Exception(
f"Too many consecutive failures ({consecutive_failures})"
)
await asyncio.sleep(8**consecutive_failures) # Exponential backoff
continue
except Exception as e:
logger.error(f"| ✗ LLM call failed on turn {turn_count + 1}: {e}")
consecutive_failures += 1
if consecutive_failures >= max_consecutive_failures:
raise
if "ContextWindowExceededError" in str(e):
# Best-effort fallback: compact and retry once.
messages = await self._maybe_compact_litellm_messages(
messages=messages,
total_tokens=total_tokens,
tool_call_log_file=tool_call_log_file,
current_prompt_tokens=self.compaction_token,
)
self._update_progress(messages, total_tokens, turn_count)
continue
elif "RateLimitError" in str(e):
await asyncio.sleep(12**consecutive_failures)
else:
await asyncio.sleep(2**consecutive_failures)
continue
# Extract actual model name from response (first turn only)
if turn_count == 0 and hasattr(response, "model") and response.model:
self.litellm_run_model_name = response.model.split("/")[-1]
# Update token usage including reasoning tokens
if hasattr(response, "usage") and response.usage:
input_tokens = response.usage.prompt_tokens or 0
total_tokens_count = response.usage.total_tokens or 0
# Calculate output tokens as total - input for consistency
output_tokens = (
total_tokens_count - input_tokens
if total_tokens_count > 0
else (response.usage.completion_tokens or 0)
)
total_tokens["input_tokens"] += input_tokens
total_tokens["output_tokens"] += output_tokens
total_tokens["total_tokens"] += total_tokens_count
# Extract reasoning tokens if available
if hasattr(response.usage, "completion_tokens_details"):
details = response.usage.completion_tokens_details
if hasattr(details, "reasoning_tokens"):
total_tokens["reasoning_tokens"] += (
details.reasoning_tokens or 0
)
# Get response message
choices = response.choices
if len(choices):
message = choices[0].message
# deeply dump the message to ensure we capture all fields
message_dict = (
message.model_dump()
if hasattr(message, "model_dump")
else dict(message)
)
# Explicitly preserve function_call if present (even if tool_calls exists),
# as it may contain provider-specific metadata (e.g. Gemini thought_signature)
if hasattr(message, "function_call") and message.function_call:
# Ensure it's in the dict if model_dump missed it or it was excluded
if (
"function_call" not in message_dict
or not message_dict["function_call"]
):
fc = message.function_call
message_dict["function_call"] = (
fc.model_dump() if hasattr(fc, "model_dump") else fc
)
# Log assistant's text content if present
if hasattr(message, "content") and message.content:
# Display the content with line prefix
for line in message.content.splitlines():
logger.info(f"| {line}")
# Also log to file if specified
if tool_call_log_file:
with open(tool_call_log_file, "a", encoding="utf-8") as f:
f.write(f"{message.content}\n")
# Check for tool calls (newer format)
if hasattr(message, "tool_calls") and message.tool_calls:
messages.append(message_dict)
turn_count += 1
# Update progress after assistant with tool calls
self._update_progress(messages, total_tokens, turn_count)
# Process tool calls
for tool_call in message.tool_calls:
func_name = tool_call.function.name
func_args = json.loads(tool_call.function.arguments)
try:
result = await asyncio.wait_for(
mcp_server.call_tool(func_name, func_args), timeout=60
)
messages.append(
{
"role": "tool",
"tool_call_id": tool_call.id,
"content": json.dumps(
result, cls=CustomJSONEncoder
),
}
)
except asyncio.TimeoutError:
error_msg = (
f"Tool call '{func_name}' timed out after 60 seconds"
)
logger.error(error_msg)
messages.append(
{
"role": "tool",
"tool_call_id": tool_call.id,
"content": f"Error: {error_msg}",
}
)
except Exception as e:
logger.error(f"Tool call failed: {e}")
messages.append(
{
"role": "tool",
"tool_call_id": tool_call.id,
"content": f"Error: {str(e)}",
}
)
# Format arguments for display (truncate if too long)
args_str = json.dumps(func_args, separators=(",", ": "))
display_arguments = (
args_str[:140] + "..." if len(args_str) > 140 else args_str
)
# Log with ANSI color codes (bold tool name, dim gray arguments)
logger.info(
f"| \033[1m{func_name}\033[0m \033[2;37m{display_arguments}\033[0m"
)
if tool_call_log_file:
with open(tool_call_log_file, "a", encoding="utf-8") as f:
f.write(f"| {func_name} {args_str}\n")
# Update progress after tool results appended
self._update_progress(messages, total_tokens, turn_count)
continue
else:
# Log end reason
if not choices:
logger.info(
"|\n|\n| Task ended with no messages generated by the model."
)
elif choices[0].finish_reason == "stop":
logger.info(
"|\n|\n| Task ended with the finish reason from messages being 'stop'."
)
# No tool/function call, add message and we're done
messages.append(message_dict)
turn_count += 1
# Update progress before exiting
self._update_progress(messages, total_tokens, turn_count)
ended_normally = True
break
except Exception as loop_error:
# On any error, return partial conversation, token usage, and turn count
logger.error(f"Manual MCP loop failed: {loop_error}", exc_info=True)
sdk_format_messages = self._convert_to_sdk_format(messages)
return {
"success": False,
"output": sdk_format_messages,
"token_usage": total_tokens,
"turn_count": turn_count,
"error": str(loop_error),
"litellm_run_model_name": self.litellm_run_model_name,
}
# Detect if we exited due to hitting the turn limit
if (not ended_normally) and (turn_count >= max_turns):
hit_turn_limit = True
logger.warning(
f"| Max turns ({max_turns}) exceeded); returning failure with partial output."
)
if tool_call_log_file:
try:
with open(tool_call_log_file, "a", encoding="utf-8") as f:
f.write(f"| Max turns ({max_turns}) exceeded\n")
except Exception:
pass
# Display final token usage
if total_tokens["total_tokens"] > 0:
log_msg = (
f"| Token usage: Total: {total_tokens['total_tokens']:,} | "
f"Input: {total_tokens['input_tokens']:,} | "
f"Output: {total_tokens['output_tokens']:,}"
)
if total_tokens.get("reasoning_tokens", 0) > 0:
log_msg += f" | Reasoning: {total_tokens['reasoning_tokens']:,}"
logger.info(log_msg)
logger.info(f"| Turns: {turn_count}")
# Convert messages to SDK format for backward compatibility
sdk_format_messages = self._convert_to_sdk_format(messages)
return {
"success": not hit_turn_limit,
"output": sdk_format_messages,
"token_usage": total_tokens,
"turn_count": turn_count,
"error": (f"Max turns ({max_turns}) exceeded" if hit_turn_limit else None),
"litellm_run_model_name": self.litellm_run_model_name,
}
# ==================== MCP Server Management ====================
async def _create_mcp_server(self) -> Any:
"""Create and return an MCP server instance."""
if self.mcp_service in self.STDIO_SERVICES:
return self._create_stdio_server()
elif self.mcp_service in self.HTTP_SERVICES:
return self._create_http_server()
else:
raise ValueError(f"Unsupported MCP service: {self.mcp_service}")
def _create_stdio_server(self) -> MCPStdioServer:
"""Create stdio-based MCP server."""
if self.mcp_service == "notion":
notion_key = self.service_config.get("notion_key")
if not notion_key:
raise ValueError("Notion API key required")
return MCPStdioServer(
command="npx",
args=["-y", "@notionhq/notion-mcp-server@1.9.1"],
env={
"OPENAPI_MCP_HEADERS": (
'{"Authorization": "Bearer ' + notion_key + '", '
'"Notion-Version": "2022-06-28"}'
)
},
)
elif self.mcp_service == "filesystem":
test_directory = self.service_config.get("test_directory")
if not test_directory:
raise ValueError("Test directory required for filesystem service")
return MCPStdioServer(
command="npx",
args=[
"-y",
"@modelcontextprotocol/server-filesystem",
str(test_directory),
],
)
elif self.mcp_service in ["playwright", "playwright_webarena"]:
browser = self.service_config.get("browser", "chromium")
headless = self.service_config.get("headless", True)
viewport_width = self.service_config.get("viewport_width", 1280)
viewport_height = self.service_config.get("viewport_height", 720)
args = ["-y", "@playwright/mcp@latest"]
if headless:
args.append("--headless")
args.extend(
[
"--isolated",
"--no-sandbox",
"--browser",
browser,
"--viewport-size",
f"{viewport_width},{viewport_height}",
]
)
return MCPStdioServer(command="npx", args=args)
elif self.mcp_service == "postgres":
host = self.service_config.get("host", "localhost")
port = self.service_config.get("port", 5432)
username = self.service_config.get("username")
password = self.service_config.get("password")
database = self.service_config.get(
"current_database"
) or self.service_config.get("database")
if not all([username, password, database]):
raise ValueError("PostgreSQL requires username, password, and database")
database_url = (
f"postgresql://{username}:{password}@{host}:{port}/{database}"
)
return MCPStdioServer(
command="pipx",
args=["run", "postgres-mcp", "--access-mode=unrestricted"],
env={"DATABASE_URI": database_url},
)
elif self.mcp_service == "insforge":
api_key = self.service_config.get("api_key")
backend_url = self.service_config.get("backend_url")
if not all([api_key, backend_url]):
raise ValueError("Insforge requires api_key and backend_url")
return MCPStdioServer(
command="npx",
args=["-y", "@insforge/mcp@dev"],
env={
"INSFORGE_API_KEY": api_key,
"INSFORGE_BACKEND_URL": backend_url,
},
)
elif self.mcp_service == "github":
github_token = self.service_config.get("github_token")
if not github_token:
raise ValueError("GitHub token required")
return MCPStdioServer(
command="docker",
args=[
"run", "-i", "--rm",
"-e", "GITHUB_PERSONAL_ACCESS_TOKEN",
"ghcr.io/github/github-mcp-server:v0.15.0",
],
env={"GITHUB_PERSONAL_ACCESS_TOKEN": github_token},
)
else:
raise ValueError(f"Unsupported stdio service: {self.mcp_service}")
def _create_http_server(self) -> MCPHttpServer:
"""Create HTTP-based MCP server."""
if self.mcp_service == "supabase":
# Use built-in MCP server from Supabase CLI
api_url = self.service_config.get("api_url", "http://localhost:54321")
api_key = self.service_config.get("api_key", "")
if not api_key:
raise ValueError(
"Supabase requires api_key (use secret key from 'supabase status')"
)
# Supabase CLI exposes MCP at /mcp endpoint
mcp_url = f"{api_url}/mcp"
return MCPHttpServer(
url=mcp_url,
headers={
"apikey": api_key,
"Authorization": f"Bearer {api_key}",
},
)
else:
raise ValueError(f"Unsupported HTTP service: {self.mcp_service}")
================================================
FILE: src/agents/react_agent.py
================================================
"""ReAct agent implementation for the MCPMark pipeline."""
from __future__ import annotations
import asyncio
import json
import time
from typing import Any, Dict, List, Optional, Callable
import litellm
from src.logger import get_logger
from .base_agent import BaseMCPAgent
logger = get_logger(__name__)
class ReActAgent(BaseMCPAgent):
"""ReAct-style agent that reuses MCPMark infrastructure."""
DEFAULT_SYSTEM_PROMPT = (
"You are a careful ReAct (reasoning and acting) agent. "
"At each step you must decide whether to call a tool or provide a final response. "
"Only use the tools that are listed for you. When you finish, respond with either the final answer "
"or the phrase \"Task completed.\" if no further detail is required. "
"Every reply must be valid JSON without code fences."
)
COMPACTION_PROMPT = (
"You are performing a CONTEXT CHECKPOINT COMPACTION.\n"
"Summarize the conversation so far for another model to continue.\n\n"
"Include:\n"
"- Current progress and key decisions made\n"
"- Important context, constraints, or user preferences\n"
"- What remains to be done (clear next steps)\n"
"- Any critical data, examples, or references needed to continue\n\n"
"Be concise and structured. Do NOT call tools."
)
def __init__(
self,
litellm_input_model_name: str,
api_key: str,
base_url: str,
mcp_service: str,
timeout: int = BaseMCPAgent.DEFAULT_TIMEOUT,
service_config: Optional[Dict[str, Any]] = None,
service_config_provider: Optional[Callable[[], Dict[str, Any]]] = None,
reasoning_effort: Optional[str] = "default",
max_iterations: int = 100,
system_prompt: Optional[str] = None,
compaction_token: int = BaseMCPAgent.COMPACTION_DISABLED_TOKEN,
):
super().__init__(
litellm_input_model_name=litellm_input_model_name,
api_key=api_key,
base_url=base_url,
mcp_service=mcp_service,
timeout=timeout,
service_config=service_config,
service_config_provider=service_config_provider,
reasoning_effort=reasoning_effort,
compaction_token=compaction_token,
)
self.max_iterations = max_iterations
self.react_system_prompt = system_prompt or self.DEFAULT_SYSTEM_PROMPT
async def execute(
self,
instruction: str,
tool_call_log_file: Optional[str] = None,
) -> Dict[str, Any]:
start_time = time.time()
try:
self._reset_progress()
self._refresh_service_config()
async def _run_react():
return await self._execute_react_loop(instruction, tool_call_log_file)
result = await asyncio.wait_for(_run_react(), timeout=self.timeout)
execution_time = time.time() - start_time
self.usage_tracker.update(
success=result.get("success", False),
token_usage=result.get("token_usage", {}),
turn_count=result.get("turn_count", 0),
execution_time=execution_time,
)
result["execution_time"] = execution_time
return result
except Exception as exc: # noqa: BLE001
execution_time = time.time() - start_time
if isinstance(exc, asyncio.TimeoutError):
error_msg = f"Execution timed out after {self.timeout} seconds"
logger.error(error_msg)
else:
error_msg = f"ReAct agent execution failed: {exc}"
logger.error(error_msg, exc_info=True)
self.usage_tracker.update(
success=False,
token_usage=self._partial_token_usage or {},
turn_count=self._partial_turn_count or 0,
execution_time=execution_time,
)
if self._partial_messages:
final_msg = self._convert_to_sdk_format(self._partial_messages)
else:
final_msg = []
return {
"success": False,
"output": final_msg,
"token_usage": self._partial_token_usage or {},
"turn_count": self._partial_turn_count or 0,
"execution_time": execution_time,
"error": error_msg,
"litellm_run_model_name": self.litellm_run_model_name,
}
async def _execute_react_loop(
self,
instruction: str,
tool_call_log_file: Optional[str],
) -> Dict[str, Any]:
system_message = {"role": "system", "content": self.react_system_prompt}
total_tokens = {
"input_tokens": 0,
"output_tokens": 0,
"total_tokens": 0,
"reasoning_tokens": 0,
}
turn_count = 0
success = False
final_error: Optional[str] = None
mcp_server = await self._create_mcp_server()
async with mcp_server:
tools = await mcp_server.list_tools()
tool_map = {tool.get("name"): tool for tool in tools}
tools_description = self._render_tools_description(tools)
task_message = {
"role": "user",
"content": self._build_task_prompt(
instruction=instruction,
tools_description=tools_description,
),
}
messages: List[Dict[str, Any]] = [system_message, task_message]
self._update_progress(messages, total_tokens, turn_count)
for step in range(1, self.max_iterations + 1):
current_prompt_tokens = 0
if self._compaction_enabled():
current_prompt_tokens = self._count_prompt_tokens_litellm(messages)
if self._compaction_enabled() and current_prompt_tokens >= self.compaction_token:
logger.info(
f"| [compaction] Triggered at prompt tokens: {current_prompt_tokens:,}"
)
if tool_call_log_file:
try:
with open(tool_call_log_file, "a", encoding="utf-8") as log_file:
log_file.write(
f"| [compaction] Triggered at prompt tokens: {current_prompt_tokens:,}\n"
)
except Exception: # noqa: BLE001
pass
compact_messages = [
{"role": "system", "content": self.COMPACTION_PROMPT},
{"role": "user", "content": json.dumps(messages, ensure_ascii=False)},
]
compact_kwargs = {
"model": self.litellm_input_model_name,
"messages": compact_messages,
"api_key": self.api_key,
}
if self.base_url:
compact_kwargs["base_url"] = self.base_url
compact_response = await litellm.acompletion(**compact_kwargs)
usage = getattr(compact_response, "usage", None)
if usage:
prompt_tokens = (
getattr(usage, "prompt_tokens", None)
or getattr(usage, "input_tokens", None)
or 0
)
completion_tokens = (
getattr(usage, "completion_tokens", None)
or getattr(usage, "output_tokens", None)
or 0
)
total_tokens_count = getattr(usage, "total_tokens", None)
if total_tokens_count is None:
total_tokens_count = prompt_tokens + completion_tokens
total_tokens["input_tokens"] += int(prompt_tokens or 0)
total_tokens["output_tokens"] += int(completion_tokens or 0)
total_tokens["total_tokens"] += int(total_tokens_count or 0)
summary = ""
try:
summary = compact_response.choices[0].message.content or ""
except Exception: # noqa: BLE001
summary = ""
summary = summary.strip() or "(no summary)"
messages = [
system_message,
task_message,
{
"role": "user",
"content": (
"Context summary (auto-compacted due to token limit):\n"
f"{summary}"
),
},
]
self._update_progress(messages, total_tokens, turn_count)
completion_kwargs = {
"model": self.litellm_input_model_name,
"messages": messages,
"api_key": self.api_key,
}
if self.base_url:
completion_kwargs["base_url"] = self.base_url
if self.reasoning_effort != "default":
completion_kwargs["reasoning_effort"] = self.reasoning_effort
try:
response = await asyncio.wait_for(
litellm.acompletion(**completion_kwargs),
timeout=self.timeout / 2,
)
except asyncio.TimeoutError:
final_error = f"LLM call timed out on step {step}"
logger.error(final_error)
break
except Exception as exc: # noqa: BLE001
final_error = f"LLM call failed on step {step}: {exc}"
logger.error(final_error)
if "ContextWindowExceededError" in str(exc):
continue
break
if turn_count == 0 and getattr(response, "model", None):
self.litellm_run_model_name = response.model.split("/")[-1]
usage = getattr(response, "usage", None)
if usage:
prompt_tokens = (
getattr(usage, "prompt_tokens", None)
or getattr(usage, "input_tokens", None)
or 0
)
completion_tokens = (
getattr(usage, "completion_tokens", None)
or getattr(usage, "output_tokens", None)
or 0
)
total_tokens_count = getattr(usage, "total_tokens", None)
if total_tokens_count is None:
total_tokens_count = prompt_tokens + completion_tokens
total_tokens["input_tokens"] += prompt_tokens
total_tokens["output_tokens"] += completion_tokens
total_tokens["total_tokens"] += total_tokens_count
# Extract reasoning tokens if available
if hasattr(response.usage, 'completion_tokens_details'):
details = response.usage.completion_tokens_details
if hasattr(details, 'reasoning_tokens'):
total_tokens["reasoning_tokens"] += details.reasoning_tokens or 0
choice = response.choices[0]
message_obj = getattr(choice, "message", None)
if message_obj is None and isinstance(choice, dict):
message_obj = choice.get("message")
if message_obj is None:
content_raw = getattr(choice, "text", "")
else:
content_raw = message_obj.get("content", "")
assistant_text = self._normalize_content(content_raw)
assistant_message = {"role": "assistant", "content": assistant_text}
messages.append(assistant_message)
turn_count += 1
self._update_progress(messages, total_tokens, turn_count)
parsed = self._parse_react_response(assistant_text)
if not parsed or "thought" not in parsed:
warning = (
"The previous response was not valid JSON following the required schema. "
"Please respond again using the JSON formats provided."
)
messages.append({"role": "user", "content": warning})
self._update_progress(messages, total_tokens, turn_count)
final_error = "Model produced an invalid response format."
continue
thought = parsed.get("thought", "")
action = parsed.get("action")
answer = parsed.get("answer")
result = parsed.get("result")
logger.info(f"|\n| \033[1;3mThought\033[0m: {str(thought)}")
if tool_call_log_file:
try:
with open(tool_call_log_file, "a", encoding="utf-8") as log_file:
log_file.write(f"| {str(thought)}\n")
except Exception: # noqa: BLE001
pass
if action is not None:
func_name = action.get("tool")
arguments = action.get("arguments", {}) or {}
args_str = json.dumps(arguments, separators=(",", ": "))
display_arguments = args_str[:140] + "..." if len(args_str) > 140 else args_str
logger.info(f"| \033[1;3mAction\033[0m: \033[1m{func_name}\033[0m \033[2;37m{display_arguments}\033[0m")
if answer is not None:
success = True
break
if action is not None and isinstance(action, dict):
tool_name = action.get("tool")
arguments = action.get("arguments", {}) or {}
if tool_name not in tool_map:
observation = (
f"Invalid tool '{tool_name}'. Available tools: "
f"{', '.join(tool_map)}"
)
else:
try:
tool_response = await asyncio.wait_for(
mcp_server.call_tool(tool_name, arguments),
timeout=60,
)
observation = self._tool_result_to_text(tool_response)
except asyncio.TimeoutError:
observation = f"Tool '{tool_name}' timed out"
except Exception as tool_exc: # noqa: BLE001
observation = f"Tool '{tool_name}' failed: {tool_exc}"
if tool_call_log_file:
try:
with open(tool_call_log_file, "a", encoding="utf-8") as log_file:
log_file.write(f"| {tool_name} {json.dumps(arguments, ensure_ascii=False)}\n")
except Exception: # noqa: BLE001
pass
observation_message = {
"role": "user",
"content": (
f"Observation:\n{observation}\n"
"Please continue reasoning and reply using the required JSON format."
),
}
messages.append(observation_message)
self._update_progress(messages, total_tokens, turn_count)
continue
if result is not None:
observation_message = {
"role": "user",
"content": (
f"Observation:\n{result}\n"
"Please continue reasoning and reply using the required JSON format."
),
}
messages.append(observation_message)
self._update_progress(messages, total_tokens, turn_count)
continue
# Unexpected structure: ask model to restate properly
messages.append(
{
"role": "user",
"content": (
"The previous reply did not include an action, result, or answer. "
"Please respond again using the JSON formats provided."
),
}
)
self._update_progress(messages, total_tokens, turn_count)
if not success and final_error is None:
final_error = (
f"Max iterations ({self.max_iterations}) reached without a final answer."
)
if total_tokens["total_tokens"] > 0:
log_msg = (
f"|\n|\n| Token usage: Total: {total_tokens['total_tokens']:,} | "
f"Input: {total_tokens['input_tokens']:,} | "
f"Output: {total_tokens['output_tokens']:,}"
)
if total_tokens.get("reasoning_tokens", 0) > 0:
log_msg += f" | Reasoning: {total_tokens['reasoning_tokens']:,}"
logger.info(log_msg)
logger.info(f"| Turns: {turn_count}")
sdk_messages = self._convert_to_sdk_format(messages)
return {
"success": success,
"output": sdk_messages,
"token_usage": total_tokens,
"turn_count": turn_count,
"error": None if success else final_error,
"litellm_run_model_name": self.litellm_run_model_name,
}
def _build_task_prompt(
self,
instruction: str,
tools_description: str,
) -> str:
return (
f"Task:\n{instruction}\n\n"
f"Available MCP tools:\n{tools_description}\n\n"
"Respond using the JSON formats below.\n\n"
"If you need to use a tool:\n"
"{\n"
' "thought": "Reasoning for the next action",\n'
' "action": {\n'
' "tool": "tool-name",\n'
' "arguments": {\n'
' "parameter": value\n'
" }\n"
" }\n"
"}\n\n"
"If you can provide the final answer:\n"
"{\n"
' "thought": "Reasoning that justifies the answer",\n'
' "answer": "Either the final solution or \'Task completed.\' when no more detail is required"\n'
"}\n\n"
"Remember: omitting the action object ends the task, so only do this when finished."
)
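# Illustrative replies the prompt above asks for (the tool name "list_directory" is
# hypothetical, used only for illustration):
#   {"thought": "Inspect the workspace first",
#    "action": {"tool": "list_directory", "arguments": {"path": "."}}}
#   {"thought": "All requested changes are done", "answer": "Task completed."}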
def _render_tools_description(self, tools: List[Dict[str, Any]]) -> str:
descriptions = []
for tool in tools:
name = tool.get("name", "unknown")
description = tool.get("description", "No description provided.")
input_schema = tool.get("inputSchema", {}) or {}
properties = input_schema.get("properties", {}) or {}
required = set(input_schema.get("required", []) or [])
arg_lines = []
for prop_name, prop_details in properties.items():
details = json.dumps(prop_details, ensure_ascii=False, indent=2)
suffix = " (required)" if prop_name in required else ""
arg_lines.append(f"- {prop_name}{suffix}: {details}")
if arg_lines:
arguments_text = "\n".join(arg_lines)
else:
arguments_text = "(no arguments)"
descriptions.append(
f"Tool: {name}\nDescription: {description}\nArguments:\n{arguments_text}"
)
return "\n\n".join(descriptions) if descriptions else "(no tools available)"
def _normalize_content(self, content: Any) -> str:
if isinstance(content, str):
return content
if isinstance(content, list):
parts = []
for block in content:
if isinstance(block, dict):
if block.get("type") == "text":
parts.append(block.get("text", ""))
elif "text" in block:
parts.append(str(block.get("text")))
else:
parts.append(str(block))
return "\n".join(part for part in parts if part)
return json.dumps(content, ensure_ascii=False)
def _parse_react_response(self, payload: str) -> Dict[str, Any]:
candidate = payload.strip().strip("`").strip()
if candidate.lower().startswith("json"):
candidate = candidate[4:].lstrip()
try:
return json.loads(candidate)
except json.JSONDecodeError:
return {}
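# Example: a fully fenced reply such as ```json {"thought": "...", "answer": "done"} ```
# is tolerated; strip("`") removes the leading/trailing fences, the "json" tag is then
# dropped, and the remaining text is passed to json.loads.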
def _tool_result_to_text(self, result: Any) -> str:
if result is None:
return ""
if isinstance(result, str):
return result
try:
return json.dumps(result, ensure_ascii=False)
except TypeError:
return str(result)
================================================
FILE: src/agents/utils/__init__.py
================================================
"""
Utility functions for MCPMark Agent
====================================
"""
from .token_usage import TokenUsageTracker
__all__ = ["TokenUsageTracker"]
================================================
FILE: src/agents/utils/token_usage.py
================================================
"""
Token Usage Tracking Utilities
===============================
"""
from typing import Dict, Any
class TokenUsageTracker:
"""Track token usage across agent executions."""
def __init__(self):
"""Initialize token usage tracker."""
self.reset()
def reset(self):
"""Reset all usage statistics."""
self._stats = {
"total_input_tokens": 0,
"total_output_tokens": 0,
"total_tokens": 0,
"total_turns": 0,
"total_execution_time": 0.0,
"successful_executions": 0,
"failed_executions": 0,
}
def update(self, success: bool, token_usage: Dict[str, int],
turn_count: int, execution_time: float):
"""
Update usage statistics.
Args:
success: Whether execution was successful
token_usage: Token usage dict with input_tokens, output_tokens, total_tokens
turn_count: Number of conversation turns
execution_time: Execution time in seconds
"""
if success:
self._stats["successful_executions"] += 1
else:
self._stats["failed_executions"] += 1
self._stats["total_input_tokens"] += token_usage.get("input_tokens", 0)
self._stats["total_output_tokens"] += token_usage.get("output_tokens", 0)
self._stats["total_tokens"] += token_usage.get("total_tokens", 0)
self._stats["total_turns"] += turn_count
self._stats["total_execution_time"] += execution_time
def get_stats(self) -> Dict[str, Any]:
"""
Get usage statistics with calculated averages.
Returns:
Dictionary containing usage statistics
"""
stats = self._stats.copy()
# Calculate averages
total_executions = stats["successful_executions"] + stats["failed_executions"]
if total_executions > 0:
stats["avg_input_tokens"] = stats["total_input_tokens"] / total_executions
stats["avg_output_tokens"] = stats["total_output_tokens"] / total_executions
stats["avg_total_tokens"] = stats["total_tokens"] / total_executions
stats["avg_turns"] = stats["total_turns"] / total_executions
stats["avg_execution_time"] = stats["total_execution_time"] / total_executions
stats["success_rate"] = (stats["successful_executions"] / total_executions * 100)
else:
stats.update({
"avg_input_tokens": 0.0,
"avg_output_tokens": 0.0,
"avg_total_tokens": 0.0,
"avg_turns": 0.0,
"avg_execution_time": 0.0,
"success_rate": 0.0,
})
return stats
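# Illustrative usage sketch (not part of the original module; the numbers are hypothetical):
#   tracker = TokenUsageTracker()
#   tracker.update(success=True,
#                  token_usage={"input_tokens": 1200, "output_tokens": 300, "total_tokens": 1500},
#                  turn_count=4, execution_time=12.5)
#   stats = tracker.get_stats()
#   stats["success_rate"]  -> 100.0; stats["avg_turns"] -> 4.0; stats["avg_total_tokens"] -> 1500.0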
================================================
FILE: src/aggregators/aggregate_results.py
================================================
#!/usr/bin/env python3
"""
Simplified MCPMark Results Aggregator
Aggregates evaluation results and generates summary with pass@k metrics.
"""
import json
import os
import argparse
import subprocess
import shutil
import tempfile
from pathlib import Path
from collections import defaultdict
from typing import Dict, List, Any, Tuple, Optional
from datetime import datetime
import sys
sys.path.append(str(Path(__file__).parent.parent.parent))
from src.errors import is_retryable_error
from src.aggregators.pricing import compute_cost_usd
# Supported difficulty splits in ./tasks/<mcp_service>/<task_set>/
SUPPORTED_TASK_SETS = {"standard", "easy"}
def discover_tasks(task_set: str = "standard") -> Dict[str, List[str]]:
"""Discover all tasks from ./tasks directory filtered by task set."""
tasks_dir = Path("./tasks")
all_tasks = {}
# Handle each MCP service
# Note: playwright and playwright_webarena both map to "playwright" MCP
service_mappings = {
"filesystem": ["filesystem"],
"github": ["github"],
"notion": ["notion"],
"playwright": ["playwright", "playwright_webarena"], # Both count as playwright
"postgres": ["postgres"], # supabase and insforge are variants with same tasks, don't merge
}
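# Illustrative layouts this loop handles (task ids become "<category>__<task>"):
#   tasks/<service>/<task_set>/<category>/<task>/   (partitioned, e.g. task_set = "easy")
#   tasks/<service>/<category>/<task>/              (legacy layout without task sets)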
for mcp_service, task_dirs in service_mappings.items():
tasks: List[str] = []
for task_dir_name in task_dirs:
service_path = tasks_dir / task_dir_name
if not service_path.exists():
continue
selected_root = service_path / task_set
# Detect if this service has partitioned task sets (e.g. standard/easy)
has_partitioned_layout = any(
child.is_dir() and child.name in SUPPORTED_TASK_SETS
for child in service_path.iterdir()
)
if selected_root.exists():
search_roots = [selected_root]
elif has_partitioned_layout:
# Requested task set missing for this service; skip it for this run
print(f" ⚠️ No '{task_set}' tasks found under {service_path}")
search_roots = []
else:
# Legacy layout without task sets – fall back to original structure
search_roots = [service_path]
for root in search_roots:
for category_dir in root.iterdir():
if not category_dir.is_dir() or category_dir.name.startswith("__"):
continue
for task_dir in category_dir.iterdir():
if task_dir.is_dir() and not task_dir.name.startswith("__"):
tasks.append(f"{category_dir.name}__{task_dir.name}")
all_tasks[mcp_service] = sorted(tasks)
return all_tasks
def collect_results(exp_dir: Path, k: int) -> Dict[str, Dict[str, Any]]:
"""Collect all results from experiment directory."""
results = defaultdict(lambda: defaultdict(lambda: defaultdict(dict)))
# Current layout: results/<exp-name>/<model>__<service>/run-N/<category>__<task>/
# Some pipelines include task-set suffix in service dir (e.g., "filesystem-easy").
# Normalize such names back to canonical service keys used by tasks/ (filesystem, github, notion, playwright, postgres).
def normalize_service_name(name: str) -> str:
# Strip known task-set suffixes like "-easy" or "-standard"
if name.endswith("-easy") or name.endswith("-standard"):
base = name.rsplit("-", 1)[0]
else:
base = name
# Map variant names to canonical service
if base == "playwright_webarena":
return "playwright"
return base
for model_service_dir in exp_dir.iterdir():
if not model_service_dir.is_dir() or "__" not in model_service_dir.name:
continue
model, service = model_service_dir.name.split("__", 1)
# Normalize service names (strip task-set suffixes, map variants to canonical keys)
service = normalize_service_name(service)
if service in ["supabase", "insforge"]:
service = "postgres"
for run_idx in range(1, k + 1):
run_dir = model_service_dir / f"run-{run_idx}"
if not run_dir.exists():
continue
for task_dir in run_dir.iterdir():
if not task_dir.is_dir() or "__" not in task_dir.name:
continue
meta_path = task_dir / "meta.json"
if meta_path.exists():
with open(meta_path) as f:
meta = json.load(f)
task_name = task_dir.name
results[model][service][f"run-{run_idx}"][task_name] = meta
return results
def check_completeness_and_validity(
results: Dict, all_tasks: Dict, k: int, single_run_models: List[str]
) -> Tuple[Dict, Dict, Dict]:
"""Check completeness and validity of results."""
complete_models = {}
incomplete_models = {}
invalid_models = {}
for model, model_results in results.items():
is_single_run = any(srm in model for srm in single_run_models)
required_runs = 1 if is_single_run else k
missing_info = []
invalid_info = []
# Check each service
for service, service_tasks in all_tasks.items():
if service not in model_results:
missing_info.append(f"Missing entire service: {service}")
continue
service_results = model_results[service]
# Check runs
for run_idx in range(1, required_runs + 1):
run_name = f"run-{run_idx}"
if run_name not in service_results:
missing_info.append(f"Missing {run_name} for {service}")
continue
run_results = service_results[run_name]
# Check tasks
missing_tasks = []
invalid_tasks = []
for task in service_tasks:
if task not in run_results:
missing_tasks.append(task)
else:
# Check for retryable errors only if the task did not succeed
meta = run_results[task]
success = bool(meta.get("execution_result", {}).get("success", False))
error_msg = meta.get("execution_result", {}).get("error_message", "")
if (not success) and error_msg and is_retryable_error(error_msg):
invalid_tasks.append(f"{task}: {error_msg[:50]}...")
if missing_tasks:
missing_info.append(f"{service}/{run_name}: missing {len(missing_tasks)} tasks")
if invalid_tasks:
invalid_info.extend([f"{service}/{run_name}/{t}" for t in invalid_tasks])
if missing_info:
incomplete_models[model] = missing_info
elif invalid_info:
invalid_models[model] = invalid_info
else:
complete_models[model] = model_results
return complete_models, incomplete_models, invalid_models
def calculate_metrics(complete_models: Dict, all_tasks: Dict, k: int, single_run_models: List[str]) -> Dict:
"""Calculate rich metrics (totals, averages, per-run aggregates, pass@k) for complete models."""
summary = {
"generated_at": datetime.now().isoformat(),
"k": k,
"overall": {},
}
# Initialize per-service sections mirroring overall structure
for service in all_tasks.keys():
summary[service] = {}
# Helper to safely extract token usage numbers
def get_token_counts(meta: Dict[str, Any]) -> Tuple[int, int, int]:
tu = meta.get("token_usage", {}) or {}
input_tokens = int(tu.get("input_tokens", 0) or 0)
output_tokens = int(tu.get("output_tokens", 0) or 0)
total_tokens = int(tu.get("total_tokens", input_tokens + output_tokens) or (input_tokens + output_tokens))
return input_tokens, output_tokens, total_tokens
for model, model_results in complete_models.items():
is_single_run = any(srm in model for srm in single_run_models)
runs_count = 1 if is_single_run else k
total_tasks = sum(len(tasks) for tasks in all_tasks.values())
# Aggregates across all services and runs
total_agent_execution_time = 0.0
total_input_tokens = 0
total_output_tokens = 0
total_tokens = 0
total_turns = 0
# For optional fields
actual_model_name: Optional[str] = None
# If cost info is not present in metas, leave as None
per_run_cost: Optional[float] = None
# Model-level flags (to be inferred from meta.json)
is_open_source_model: Optional[bool] = None
is_reasoning_model: Optional[bool] = None
# For pass@1 per-run statistics across all services
pass1_rates_per_run_overall: List[float] = []
# For pass@k and pass^k across all services
pass_k_task_success_any = 0
pass_power_k_task_success_all = 0
# Precompute successes per task across runs for overall
# Also accumulate totals for tokens/time/turns
for run_idx in range(1, runs_count + 1):
run_name = f"run-{run_idx}"
successes_this_run = 0
for service, service_tasks in all_tasks.items():
# service-level aggregates for this model (will compute fully below)
for task in service_tasks:
meta = (
model_results
.get(service, {})
.get(run_name, {})
.get(task)
)
# In complete_models, meta should exist; still guard
if not meta:
continue
success = bool(meta.get("execution_result", {}).get("success", False))
if success:
successes_this_run += 1
# totals accumulation
total_agent_execution_time += float(meta.get("agent_execution_time", 0.0) or 0.0)
in_tok, out_tok, ttl_tok = get_token_counts(meta)
total_input_tokens += in_tok
total_output_tokens += out_tok
total_tokens += ttl_tok
total_turns += int(meta.get("turn_count", 0) or 0)
# capture actual model name if present
if actual_model_name is None:
actual_model_name = meta.get("actual_model_name") or None
# capture cost if present in any meta as per-run cost token (rare)
if per_run_cost is None:
# A few possible fields people use; if none present, stays None
possible_cost = meta.get("per_run_cost") or meta.get("run_cost") or meta.get("cost")
if isinstance(possible_cost, (int, float)):
per_run_cost = float(possible_cost)
# capture model flags if present
if is_open_source_model is None and "is_open_source_model" in meta:
is_open_source_model = bool(meta.get("is_open_source_model"))
if is_reasoning_model is None and "is_reasoning_model" in meta:
is_reasoning_model = bool(meta.get("is_reasoning_model"))
pass1_rates_per_run_overall.append(round(successes_this_run / total_tasks, 6))
# Compute pass@k and pass^k across tasks (overall)
if not is_single_run:
for service, service_tasks in all_tasks.items():
for task in service_tasks:
successes = []
for run_idx in range(1, runs_count + 1):
run_name = f"run-{run_idx}"
meta = (
model_results
.get(service, {})
.get(run_name, {})
.get(task)
)
success = bool(meta.get("execution_result", {}).get("success", False)) if meta else False
successes.append(success)
if any(successes):
pass_k_task_success_any += 1
if all(successes):
pass_power_k_task_success_all += 1
# Build overall metrics entry
denom = total_tasks * runs_count if total_tasks > 0 else 1
avg_agent_execution_time = total_agent_execution_time / denom
avg_input_tokens = total_input_tokens / denom
avg_output_tokens = total_output_tokens / denom
avg_total_tokens = total_tokens / denom
avg_turns = total_turns / denom
# pass@1 stats across runs
if pass1_rates_per_run_overall:
avg_pass1 = sum(pass1_rates_per_run_overall) / len(pass1_rates_per_run_overall)
mean = avg_pass1
variance = (
sum((r - mean) ** 2 for r in pass1_rates_per_run_overall) / len(pass1_rates_per_run_overall)
)
std_pass1 = variance ** 0.5
else:
avg_pass1 = 0.0
std_pass1 = 0.0
# Compute per-run tokens and cost
per_run_input_tokens = total_input_tokens / runs_count if runs_count else 0
per_run_output_tokens = total_output_tokens / runs_count if runs_count else 0
model_for_pricing = actual_model_name or model
computed_per_run_cost = compute_cost_usd(model_for_pricing, per_run_input_tokens, per_run_output_tokens)
overall_metrics = {
"total_tasks": total_tasks,
"total_agent_execution_time": total_agent_execution_time,
"total_input_tokens": total_input_tokens,
"total_output_tokens": total_output_tokens,
"total_tokens": total_tokens,
"total_turns": total_turns,
"avg_agent_execution_time": round(avg_agent_execution_time, 4),
"avg_input_tokens": round(avg_input_tokens, 4),
"avg_output_tokens": round(avg_output_tokens, 4),
"avg_total_tokens": round(avg_total_tokens, 4),
"avg_turns": round(avg_turns, 4),
"per_run_input_tokens": per_run_input_tokens,
"per_run_output_tokens": per_run_output_tokens,
"per_run_cost": computed_per_run_cost if computed_per_run_cost is not None else (per_run_cost if per_run_cost is not None else None),
"actual_model_name": actual_model_name or "",
"is_open_source_model": (is_open_source_model if is_open_source_model is not None else False),
"is_reasoning_model": (is_reasoning_model if is_reasoning_model is not None else False),
"pass@1": {
"avg": round(avg_pass1, 4),
"std": round(std_pass1, 4),
},
}
if not is_single_run:
overall_metrics[f"pass@{k}"] = round(pass_k_task_success_any / total_tasks, 4)
overall_metrics[f"pass^{k}"] = round(pass_power_k_task_success_all / total_tasks, 4)
summary["overall"][model] = overall_metrics
# Per-service detailed metrics mirroring overall
for service, service_tasks in all_tasks.items():
service_total_tasks = len(service_tasks)
if service_total_tasks == 0:
continue
s_total_agent_execution_time = 0.0
s_total_input_tokens = 0
s_total_output_tokens = 0
s_total_tokens = 0
s_total_turns = 0
# per-run pass@1 for this service
s_pass1_rates_per_run: List[float] = []
# pass@k for this service
s_pass_k_task_success_any = 0
s_pass_power_k_task_success_all = 0
for run_idx in range(1, runs_count + 1):
run_name = f"run-{run_idx}"
s_successes_this_run = 0
for task in service_tasks:
meta = (
model_results
.get(service, {})
.get(run_name, {})
.get(task)
)
if not meta:
continue
success = bool(meta.get("execution_result", {}).get("success", False))
if success:
s_successes_this_run += 1
s_total_agent_execution_time += float(meta.get("agent_execution_time", 0.0) or 0.0)
in_tok, out_tok, ttl_tok = get_token_counts(meta)
s_total_input_tokens += in_tok
s_total_output_tokens += out_tok
s_total_tokens += ttl_tok
s_total_turns += int(meta.get("turn_count", 0) or 0)
s_pass1_rates_per_run.append(round(s_successes_this_run / service_total_tasks, 6))
if not is_single_run:
for task in service_tasks:
successes = []
for run_idx in range(1, runs_count + 1):
run_name = f"run-{run_idx}"
meta = (
model_results
.get(service, {})
.get(run_name, {})
.get(task)
)
success = bool(meta.get("execution_result", {}).get("success", False)) if meta else False
successes.append(success)
if any(successes):
s_pass_k_task_success_any += 1
if all(successes):
s_pass_power_k_task_success_all += 1
s_denom = service_total_tasks * runs_count if service_total_tasks > 0 else 1
s_avg_agent_execution_time = s_total_agent_execution_time / s_denom
s_avg_input_tokens = s_total_input_tokens / s_denom
s_avg_output_tokens = s_total_output_tokens / s_denom
s_avg_total_tokens = s_total_tokens / s_denom
s_avg_turns = s_total_turns / s_denom
if s_pass1_rates_per_run:
s_mean = sum(s_pass1_rates_per_run) / len(s_pass1_rates_per_run)
s_var = sum((r - s_mean) ** 2 for r in s_pass1_rates_per_run) / len(s_pass1_rates_per_run)
s_std = s_var ** 0.5
else:
s_mean = 0.0
s_std = 0.0
# Compute per-run tokens and cost for this service
s_per_run_input_tokens = s_total_input_tokens / runs_count if runs_count else 0
s_per_run_output_tokens = s_total_output_tokens / runs_count if runs_count else 0
s_computed_per_run_cost = compute_cost_usd(model_for_pricing, s_per_run_input_tokens, s_per_run_output_tokens)
service_metrics = {
"total_tasks": service_total_tasks,
"total_agent_execution_time": s_total_agent_execution_time,
"total_input_tokens": s_total_input_tokens,
"total_output_tokens": s_total_output_tokens,
"total_tokens": s_total_tokens,
"total_turns": s_total_turns,
"avg_agent_execution_time": round(s_avg_agent_execution_time, 4),
"avg_input_tokens": round(s_avg_input_tokens, 4),
"avg_output_tokens": round(s_avg_output_tokens, 4),
"avg_total_tokens": round(s_avg_total_tokens, 4),
"avg_turns": round(s_avg_turns, 4),
"per_run_input_tokens": s_per_run_input_tokens,
"per_run_output_tokens": s_per_run_output_tokens,
"per_run_cost": s_computed_per_run_cost if s_computed_per_run_cost is not None else (per_run_cost if per_run_cost is not None else None),
"actual_model_name": actual_model_name or "",
"is_open_source_model": (is_open_source_model if is_open_source_model is not None else False),
"is_reasoning_model": (is_reasoning_model if is_reasoning_model is not None else False),
"pass@1": {
"avg": round(s_mean, 4),
"std": round(s_std, 4),
},
}
if not is_single_run:
service_metrics[f"pass@{k}"] = round(s_pass_k_task_success_any / service_total_tasks, 4)
service_metrics[f"pass^{k}"] = round(s_pass_power_k_task_success_all / service_total_tasks, 4)
summary[service][model] = service_metrics
return summary
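# Abbreviated shape of the summary returned above (illustrative):
# {
#   "generated_at": "...", "k": 4,
#   "overall":    {"<model>": {"total_tasks": ..., "pass@1": {"avg": ..., "std": ...}, "pass@4": ..., "pass^4": ..., ...}},
#   "filesystem": {"<model>": {...}}, "github": {...}, ...
# }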
def generate_model_results(exp_dir: Path, complete_models: Dict, all_tasks: Dict):
"""Generate model_results directory."""
model_results_dir = exp_dir / "model_results"
if model_results_dir.exists():
shutil.rmtree(model_results_dir)
model_results_dir.mkdir()
for model, model_data in complete_models.items():
model_dir = model_results_dir / model
model_dir.mkdir()
# Create a file for each task
for service, service_tasks in all_tasks.items():
if service not in model_data:
continue
for task in service_tasks:
task_data = {
"model": model,
"service": service,
"task": task,
"runs": {}
}
# Collect data from all runs
for run_name, run_data in model_data[service].items():
if task in run_data:
meta = run_data[task]
task_data["runs"][run_name] = {
"success": meta.get("execution_result", {}).get("success", False),
"error_message": meta.get("execution_result", {}).get("error_message"),
"execution_time": meta.get("agent_execution_time", 0),
"token_usage": meta.get("token_usage", {}),
"turn_count": meta.get("turn_count", 0)
}
# Save task file
task_file = model_dir / f"{task}.json"
with open(task_file, "w") as f:
json.dump(task_data, f, indent=2)
def generate_task_results(exp_dir: Path, complete_models: Dict, all_tasks: Dict):
"""Generate task_results directory."""
task_results_dir = exp_dir / "task_results"
if task_results_dir.exists():
shutil.rmtree(task_results_dir)
task_results_dir.mkdir()
# For each task, collect results across all models
for service, service_tasks in all_tasks.items():
for task in service_tasks:
task_data = {
"task": task,
"service": service,
"models": {}
}
for model, model_data in complete_models.items():
if service not in model_data:
continue
model_task_data = {"runs": []}
for run_name, run_data in model_data[service].items():
if task in run_data:
meta = run_data[task]
agent_time = float(meta.get("agent_execution_time", 0.0) or 0.0)
token_usage = meta.get("token_usage", {}) or {}
turn_count = int(meta.get("turn_count", 0) or 0)
success = bool(meta.get("execution_result", {}).get("success", False))
model_task_data["runs"].append({
"run": run_name,
"success": success,
"execution_time": agent_time,
"agent_execution_time": agent_time,
"token_usage": token_usage,
"turn_count": turn_count,
})
if model_task_data["runs"]:
# Compute per-model summary across runs for this task
runs_list = model_task_data["runs"]
runs_count = len(runs_list)
successful_runs = sum(1 for r in runs_list if r.get("success"))
# Averages
total_agent_time = sum(float(r.get("agent_execution_time", r.get("execution_time", 0.0)) or 0.0) for r in runs_list)
avg_agent_time = round(total_agent_time / runs_count, 2)
def _tok(r, key):
tu = r.get("token_usage") or {}
return int(tu.get(key, 0) or 0)
total_input_tokens = 0
total_output_tokens = 0
total_total_tokens = 0
for r in runs_list:
in_tok = _tok(r, "input_tokens")
out_tok = _tok(r, "output_tokens")
ttl_tok = int((r.get("token_usage") or {}).get("total_tokens", in_tok + out_tok) or (in_tok + out_tok))
total_input_tokens += in_tok
total_output_tokens += out_tok
total_total_tokens += ttl_tok
avg_input_tokens = round(total_input_tokens / runs_count, 1)
avg_output_tokens = round(total_output_tokens / runs_count, 1)
avg_total_tokens = round(total_total_tokens / runs_count, 1)
total_turns = sum(int(r.get("turn_count", 0) or 0) for r in runs_list)
avg_turn_count = round(total_turns / runs_count, 2)
summary_obj = {
"total_runs": runs_count,
"successful_runs": successful_runs,
"avg_agent_execution_time": avg_agent_time,
"avg_input_tokens": avg_input_tokens,
"avg_output_tokens": avg_output_tokens,
"avg_total_tokens": avg_total_tokens,
"avg_turn_count": avg_turn_count,
}
# Include pass@k and pass^k only for multi-run models
if runs_count > 1:
summary_obj[f"pass@{runs_count}"] = 1.0 if successful_runs > 0 else 0.0
summary_obj[f"pass^{runs_count}"] = 1.0 if successful_runs == runs_count else 0.0
model_task_data["summary"] = summary_obj
task_data["models"][model] = model_task_data
# Save task file
task_file = task_results_dir / f"{task}.json"
with open(task_file, "w") as f:
json.dump(task_data, f, indent=2)
def generate_readme(exp_name: str, summary: Dict, k: int) -> str:
"""Generate README.md content with six tables: overall + 5 MCP services.
Each table includes Total Tasks, Pass@1 (avg ± std), Pass@k/Pass^k (if k > 1), Per-Run Cost (USD), and Avg Agent Time (s).
"""
def get_pass1_avg_std(metrics: Dict[str, Any]) -> Tuple[float, float]:
p1 = metrics.get("pass@1")
if isinstance(p1, dict):
return float(p1.get("avg", 0.0) or 0.0), float(p1.get("std", 0.0) or 0.0)
# Back-compat if older summaries exist
return float(p1 or 0.0), 0.0
def render_section(title: str, section_data: Dict[str, Any]) -> List[str]:
lines_sec: List[str] = [
f"## {title}",
"",
]
header = "| Model | Total Tasks | Pass@1 (avg ± std) |"
sep = "|-------|-------------|--------------------|"
# include pass@k headers if present (k>1)
include_k = k > 1
if include_k:
header += f" Pass@{k} | Pass^{k} |"
sep += "----------|----------|"
# Add Per-Run Cost (USD) and Avg Agent Time (s) at the end
header += " Per-Run Cost (USD) |"
sep += "---------------------|"
header += " Avg Agent Time (s) |"
sep += "--------------------|"
lines_sec.append(header)
lines_sec.append(sep)
# Sort by Pass@1 avg
sorted_items = sorted(
section_data.items(),
key=lambda x: get_pass1_avg_std(x[1])[0],
reverse=True
)
for model, metrics in sorted_items:
pass1_avg, pass1_std = get_pass1_avg_std(metrics)
avg_time = float(metrics.get("avg_agent_execution_time", 0.0) or 0.0)
# Format per-run cost (up to 2 decimal places, trim trailing zeros)
cost_val = metrics.get("per_run_cost")
if isinstance(cost_val, (int, float)):
rounded_cost = round(float(cost_val), 2)
formatted_cost = f"{rounded_cost:.2f}".rstrip('0').rstrip('.')
cost_str = f"${formatted_cost}"
else:
cost_str = "/"
row = (
f"| {model} | {metrics.get('total_tasks', 0)} | "
f"{pass1_avg * 100:.1f}% ± {pass1_std * 100:.1f}% |"
)
if include_k:
if f"pass@{k}" in metrics and f"pass^{k}" in metrics:
row += f" {metrics[f'pass@{k}'] * 100:.1f}% | {metrics[f'pass^{k}'] * 100:.1f}% |"
else:
# Single-run models do not have pass@k or pass^k; show placeholders
row += " / | / |"
# Append cost and avg agent time at the end
row += f" {cost_str} |"
row += f" {avg_time:.1f} |"
lines_sec.append(row)
lines_sec.append("")
return lines_sec
lines: List[str] = [
f"# {exp_name} - Evaluation Results",
"",
f"Generated: {summary['generated_at']}",
]
task_set = summary.get("task_set")
if task_set:
lines.append(f"Task set: {task_set}")
lines.append("")
# Overall table
lines.extend(render_section("Overall Performance", summary.get("overall", {})))
# Service tables: infer service keys from summary
reserved = {"overall", "generated_at", "k", "experiment_name", "task_set"}
service_keys = [key for key in summary.keys() if key not in reserved]
# Keep stable order
for service in sorted(service_keys):
title = f"{service.capitalize()} Performance"
lines.extend(render_section(title, summary.get(service, {})))
return "\n".join(lines)
def push_to_github(exp_dir: Path, exp_name: str, branch: Optional[str] = None):
"""Push results to GitHub repository."""
try:
with tempfile.TemporaryDirectory() as temp_dir:
temp_path = Path(temp_dir)
print("📥 Cloning experiments repository...")
subprocess.run([
"git", "clone",
"git@github.com:eval-sys/mcpmark-experiments.git",
str(temp_path)
], check=True, capture_output=True)
# Copy files
for item in ["summary.json", "README.md", "model_results", "task_results"]:
src = exp_dir / item
if src.exists():
dst = temp_path / item
if src.is_dir():
if dst.exists():
shutil.rmtree(dst)
shutil.copytree(src, dst)
else:
shutil.copy2(src, dst)
print(f" 📄 {item}")
# Git operations
os.chdir(temp_path)
# If a branch is specified, create/checkout it before staging changes. Otherwise, ensure main.
if branch:
try:
subprocess.run(["git", "fetch", "origin"], check=True)
except subprocess.CalledProcessError:
# Non-fatal if fetch fails in some environments
pass
subprocess.run(["git", "checkout", "-B", branch], check=True)
print(f" 🌿 Using branch '{branch}'")
else:
# Default to main branch
try:
subprocess.run(["git", "fetch", "origin"], check=True)
except subprocess.CalledProcessError:
pass
# Prefer main; if it doesn't exist locally, create tracking from origin/main
result = subprocess.run(["git", "rev-parse", "--verify", "main"], capture_output=True)
if result.returncode != 0:
# Try to checkout origin/main
try:
subprocess.run(["git", "checkout", "-B", "main", "origin/main"], check=True)
except subprocess.CalledProcessError:
# Fallback: create main if no origin/main
subprocess.run(["git", "checkout", "-B", "main"], check=True)
else:
subprocess.run(["git", "checkout", "main"], check=True)
subprocess.run(["git", "add", "."], check=True)
# Check for changes
result = subprocess.run(
["git", "diff", "--staged", "--name-only"],
capture_output=True, text=True
)
if not result.stdout.strip():
print("✅ No changes to push")
return True
# Commit and push
subprocess.run([
"git", "commit", "-m", f"Update results for {exp_name}"
], check=True)
if branch:
subprocess.run(["git", "push", "--set-upstream", "origin", branch], check=True)
else:
subprocess.run(["git", "push", "--set-upstream", "origin", "main"], check=True)
print("✅ Successfully pushed to GitHub")
return True
except subprocess.CalledProcessError as e:
print(f"❌ Git operation failed: {e}")
return False
def print_validation_report(complete: Dict, incomplete: Dict, invalid: Dict, all_tasks: Dict, k: int, single_run_models: List[str], raw_results: Dict):
"""Print structured validation report with summary table."""
# Combine all models
all_models = {}
for model in complete:
all_models[model] = {"status": "complete", "data": complete[model]}
for model in incomplete:
all_models[model] = {"status": "incomplete", "issues": incomplete[model]}
for model in invalid:
all_models[model] = {"status": "invalid", "issues": invalid[model]}
# Calculate expected counts
total_expected_tasks = sum(len(tasks) for tasks in all_tasks.values())
# Summary table
print("\n" + "=" * 100)
print("COMPLETENESS SUMMARY TABLE")
print("=" * 100)
print()
print(f"{'Model':<30} {'Expected':<12} {'Actual':<12} {'Missing':<12} {'Status':<30}")
print("-" * 100)
sorted_models = sorted(all_models.keys())
for model_name in sorted_models:
model_info = all_models[model_name]
# Determine expected runs and tasks
is_single_run = any(srm in model_name for srm in single_run_models)
expected_runs = 1 if is_single_run else k
expected_total = total_expected_tasks * expected_runs
if model_info["status"] == "complete":
# Count actual tasks from complete model data
actual_total = 0
for service, service_data in model_info["data"].items():
for run_name, run_data in service_data.items():
actual_total += len(run_data)
missing = 0
status = "✅ Complete"
else:
# For incomplete/invalid models, count from raw results
actual_total = 0
if model_name in raw_results:
for service, service_data in raw_results[model_name].items():
for run_name, run_data in service_data.items():
actual_total += len(run_data)
missing = expected_total - actual_total
if model_info["status"] == "incomplete":
# Find which services have issues
problem_services = set()
for issue in model_info["issues"]:
if "Missing entire service:" in issue:
service = issue.split(": ")[1]
problem_services.add(service)
elif "/" in issue:
service = issue.split("/")[0]
problem_services.add(service)
elif "Missing run" in issue:
service = issue.split(" for ")[1]
problem_services.add(service)
if problem_services:
services_str = ", ".join(sorted(problem_services))
status = f"❌ Incomplete ({services_str})"
else:
status = "❌ Incomplete"
else: # invalid
status = "⚠️ Invalid (retryable errors)"
# Format the row
print(f"{model_name:<30} {expected_total:<12} {actual_total:<12} {missing:<12} {status:<30}")
print()
# Overall statistics
complete_count = len(complete)
incomplete_count = len(incomplete)
invalid_count = len(invalid)
total_models = complete_count + incomplete_count + invalid_count
print("=" * 100)
print("OVERALL STATISTICS")
print("=" * 100)
print(f"Total models analyzed: {total_models}")
print(f"Complete models: {complete_count}")
print(f"Incomplete models: {incomplete_count}")
print(f"Invalid models (with retryable errors): {invalid_count}")
print(f"Total tasks per MCP: {total_expected_tasks}")
print(f"Expected runs (k): {k}")
if not complete:
print("\n❌ No models have complete and valid results!")
else:
print(f"\n✅ {complete_count} model(s) ready for aggregation: {', '.join(sorted(complete.keys()))}")
def main():
# Extra parser for push-related options
push_parent = argparse.ArgumentParser(add_help=False)
push_parent.add_argument(
"--branch",
type=str,
help="If provided with --push, push to this new branch"
)
parser = argparse.ArgumentParser(
description="Simplified MCPMark results aggregator",
parents=[push_parent],
)
parser.add_argument("--exp-name", required=True, help="Experiment name")
parser.add_argument("--k", type=int, default=4, help="Number of runs (default: 4)")
parser.add_argument(
"--single-run-models",
type=str,
help="Comma-separated list of models that only need run-1"
)
parser.add_argument(
"--task-set",
choices=sorted(SUPPORTED_TASK_SETS),
default="standard",
help="Which task subset to aggregate (default: standard)"
)
parser.add_argument("--push", action="store_true", help="Push to GitHub (default to main)")
args = parser.parse_args()
# Parse single-run models
single_run_models = []
if args.single_run_models:
single_run_models = [m.strip() for m in args.single_run_models.split(",")]
print(f"📌 Single-run models: {', '.join(single_run_models)}")
# Setup paths
exp_dir = Path("./results") / args.exp_name
if not exp_dir.exists():
print(f"❌ Experiment directory {exp_dir} does not exist")
return 1
print(f"🔄 Processing experiment: {args.exp_name}")
# Discover all tasks
print(f"📋 Discovering tasks (task set: {args.task_set})...")
all_tasks = discover_tasks(args.task_set)
total_tasks = sum(len(tasks) for tasks in all_tasks.values())
print(f" Found {total_tasks} tasks across {len(all_tasks)} services")
print("📥 Collecting results...")
results = collect_results(exp_dir, args.k)
print(f" Found results for {len(results)} models")
# Check completeness and validity
print("✓ Checking completeness and validity...")
complete_models, incomplete_models, invalid_models = check_completeness_and_validity(
results, all_tasks, args.k, single_run_models
)
# Print validation report with summary table
print_validation_report(complete_models, incomplete_models, invalid_models,
all_tasks, args.k, single_run_models, results)
# Determine which models to include in output (strict: only complete models)
models_for_output = dict(complete_models)
if not models_for_output:
return 1
# Calculate metrics
print("\n📊 Calculating metrics...")
summary = calculate_metrics(models_for_output, all_tasks, args.k, single_run_models)
summary["experiment_name"] = args.exp_name
summary["task_set"] = args.task_set
# Save summary
summary_path = exp_dir / "summary.json"
with open(summary_path, "w") as f:
json.dump(summary, f, indent=2)
print(f" 📄 Saved summary.json")
# Generate model_results
print("📁 Generating model_results...")
generate_model_results(exp_dir, models_for_output, all_tasks)
print(f" Created {len(models_for_output)} model directories")
# Generate task_results
print("📁 Generating task_results...")
generate_task_results(exp_dir, models_for_output, all_tasks)
print(f" Created {total_tasks} task files")
# Generate README
readme_content = generate_readme(args.exp_name, summary, args.k)
readme_path = exp_dir / "README.md"
with open(readme_path, "w") as f:
f.write(readme_content)
print(" 📄 Generated README.md")
# Push to GitHub if requested
if args.push:
print("\n🚀 Pushing to GitHub...")
push_to_github(exp_dir, args.exp_name, branch=args.branch)
print(f"\n🎉 Successfully processed {args.exp_name}")
return 0
if __name__ == "__main__":
exit(main())
================================================
FILE: src/aggregators/aggregate_specific_results.py
================================================
#!/usr/bin/env python3
"""
Simple Results Aggregator - Aggregate specific result directories
Usage: python -m src.aggregators.aggregate_specific_results --result-dir results/exp/model__service --k 4
"""
import json
import argparse
from pathlib import Path
from collections import defaultdict
from typing import Dict, Any, Tuple, List
from datetime import datetime
import sys
sys.path.append(str(Path(__file__).parent.parent.parent))
from src.aggregators.pricing import compute_cost_usd
def collect_results_from_dir(result_dir: Path, k: int) -> Dict[str, Any]:
"""Collect all results from a specific result directory."""
results = {}
for run_idx in range(1, k + 1):
run_dir = result_dir / f"run-{run_idx}"
if not run_dir.exists():
print(f"⚠️ Warning: {run_dir} does not exist, skipping")
continue
run_results = {}
for task_dir in run_dir.iterdir():
if not task_dir.is_dir():
continue
meta_path = task_dir / "meta.json"
if meta_path.exists():
with open(meta_path) as f:
meta = json.load(f)
run_results[task_dir.name] = meta
results[f"run-{run_idx}"] = run_results
return results
def get_token_counts(meta: Dict[str, Any]) -> Tuple[int, int, int]:
"""Extract token counts from meta."""
tu = meta.get("token_usage", {}) or {}
input_tokens = int(tu.get("input_tokens", 0) or 0)
output_tokens = int(tu.get("output_tokens", 0) or 0)
total_tokens = int(tu.get("total_tokens", input_tokens + output_tokens) or (input_tokens + output_tokens))
return input_tokens, output_tokens, total_tokens
def calculate_metrics(results: Dict, k: int, model_name: str) -> Dict:
"""Calculate metrics from results."""
# Get all unique task names
all_tasks = set()
for run_name, run_data in results.items():
all_tasks.update(run_data.keys())
all_tasks = sorted(all_tasks)
total_tasks = len(all_tasks)
actual_runs = len(results)
print(f"\n📊 Analysis:")
print(f" Total unique tasks: {total_tasks}")
print(f" Runs found: {actual_runs} (expected: {k})")
# Aggregates
total_agent_execution_time = 0.0
total_input_tokens = 0
total_output_tokens = 0
total_tokens = 0
total_turns = 0
actual_model_name = None
# Per-run pass@1
pass1_rates_per_run = []
# For pass@k
pass_k_task_success_any = 0
pass_power_k_task_success_all = 0
for run_idx in range(1, actual_runs + 1):
run_name = f"run-{run_idx}"
successes_this_run = 0
for task in all_tasks:
meta = results.get(run_name, {}).get(task)
if not meta:
continue
success = bool(meta.get("execution_result", {}).get("success", False))
if success:
successes_this_run += 1
total_agent_execution_time += float(meta.get("agent_execution_time", 0.0) or 0.0)
in_tok, out_tok, ttl_tok = get_token_counts(meta)
total_input_tokens += in_tok
total_output_tokens += out_tok
total_tokens += ttl_tok
total_turns += int(meta.get("turn_count", 0) or 0)
if actual_model_name is None:
actual_model_name = meta.get("actual_model_name") or None
pass1_rate = successes_this_run / total_tasks if total_tasks > 0 else 0
pass1_rates_per_run.append(pass1_rate)
print(f" Run {run_idx}: {successes_this_run}/{total_tasks} = {pass1_rate*100:.1f}%")
# Calculate pass@k
for task in all_tasks:
successes = []
for run_idx in range(1, actual_runs + 1):
run_name = f"run-{run_idx}"
meta = results.get(run_name, {}).get(task)
success = bool(meta.get("execution_result", {}).get("success", False)) if meta else False
successes.append(success)
if any(successes):
pass_k_task_success_any += 1
if all(successes):
pass_power_k_task_success_all += 1
# Averages
denom = total_tasks * actual_runs if total_tasks > 0 else 1
avg_agent_execution_time = total_agent_execution_time / denom
avg_input_tokens = total_input_tokens / denom
avg_output_tokens = total_output_tokens / denom
avg_total_tokens = total_tokens / denom
avg_turns = total_turns / denom
# Pass@1 stats
if pass1_rates_per_run:
avg_pass1 = sum(pass1_rates_per_run) / len(pass1_rates_per_run)
mean = avg_pass1
variance = sum((r - mean) ** 2 for r in pass1_rates_per_run) / len(pass1_rates_per_run)
std_pass1 = variance ** 0.5
else:
avg_pass1 = 0.0
std_pass1 = 0.0
# Cost calculation
per_run_input_tokens = total_input_tokens / actual_runs if actual_runs else 0
per_run_output_tokens = total_output_tokens / actual_runs if actual_runs else 0
model_for_pricing = actual_model_name or model_name
per_run_cost = compute_cost_usd(model_for_pricing, per_run_input_tokens, per_run_output_tokens)
summary = {
"generated_at": datetime.now().isoformat(),
"model": model_name,
"actual_model_name": actual_model_name or model_name,
"runs": actual_runs,
"total_tasks": total_tasks,
"total_agent_execution_time": round(total_agent_execution_time, 2),
"total_input_tokens": total_input_tokens,
"total_output_tokens": total_output_tokens,
"total_tokens": total_tokens,
"total_turns": total_turns,
"avg_agent_execution_time": round(avg_agent_execution_time, 4),
"avg_input_tokens": round(avg_input_tokens, 2),
"avg_output_tokens": round(avg_output_tokens, 2),
"avg_total_tokens": round(avg_total_tokens, 2),
"avg_turns": round(avg_turns, 2),
"per_run_input_tokens": round(per_run_input_tokens, 2),
"per_run_output_tokens": round(per_run_output_tokens, 2),
"per_run_cost": round(per_run_cost, 4) if per_run_cost else None,
"pass@1": {
"avg": round(avg_pass1, 4),
"std": round(std_pass1, 4),
"per_run": [round(r, 4) for r in pass1_rates_per_run]
},
}
if actual_runs > 1:
summary[f"pass@{actual_runs}"] = round(pass_k_task_success_any / total_tasks, 4)
summary[f"pass^{actual_runs}"] = round(pass_power_k_task_success_all / total_tasks, 4)
return summary
def main():
parser = argparse.ArgumentParser(description="Simple results aggregator for specific directories")
parser.add_argument("--result-dir", required=True, help="Path to result directory (e.g., results/exp/model__service)")
parser.add_argument("--k", type=int, default=4, help="Number of runs (default: 4)")
parser.add_argument("--output", help="Output JSON file path (default: /summary.json)")
args = parser.parse_args()
result_dir = Path(args.result_dir)
if not result_dir.exists():
print(f"❌ Result directory {result_dir} does not exist")
return 1
# Extract model name from directory name
model_name = result_dir.name.replace("__", "-")
print(f"🔄 Processing: {result_dir}")
print(f"📋 Model: {model_name}")
# Collect results
results = collect_results_from_dir(result_dir, args.k)
if not results:
print("❌ No results found")
return 1
# Calculate metrics
summary = calculate_metrics(results, args.k, model_name)
# Save summary
output_path = Path(args.output) if args.output else result_dir / "summary.json"
with open(output_path, "w") as f:
json.dump(summary, f, indent=2)
print(f"\n✅ Summary saved to: {output_path}")
print(f"\n📈 Results:")
print(f" Pass@1: {summary['pass@1']['avg']*100:.1f}% ± {summary['pass@1']['std']*100:.1f}%")
if f"pass@{args.k}" in summary:
print(f" Pass@{args.k}: {summary[f'pass@{args.k}']*100:.1f}%")
print(f" Pass^{args.k}: {summary[f'pass^{args.k}']*100:.1f}%")
print(f" Per-run cost: ${summary['per_run_cost']:.4f}" if summary['per_run_cost'] else " Per-run cost: N/A")
print(f" Avg agent time: {summary['avg_agent_execution_time']:.2f}s")
print(f" Avg turns: {summary['avg_turns']:.2f}")
print(f"\n📊 Token Usage:")
avg_tokens_per_run = summary['total_tokens'] / summary['runs'] if summary['runs'] > 0 else 0
print(f" Avg tokens per run: {avg_tokens_per_run:,.0f}")
print(f" Avg tokens per turn: {summary['avg_total_tokens'] / summary['avg_turns']:.0f}" if summary['avg_turns'] > 0 else " Avg tokens per turn: N/A")
print(f" Total tokens (all runs): {summary['total_tokens']:,}")
print(f" Total turns (all runs): {summary['total_turns']:,}")
return 0
if __name__ == "__main__":
exit(main())
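# Illustrative note (not part of the original module): the aggregator expects one
# meta.json per task per run under the given result directory, e.g.
#
#   results/exp/model__service/
#       run-1/<category>__<task>/meta.json
#       run-2/<category>__<task>/meta.json
#       ...
#
# With k runs, pass@k counts a task as solved if any run succeeded and pass^k
# only if all runs succeeded. For example, with 2 tasks and 2 runs where task A
# passes only in run-1 and task B passes in both, pass@2 = 2/2, pass^2 = 1/2,
# and the per-run pass@1 rates are 2/2 and 1/2.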
================================================
FILE: src/aggregators/aggregate_task_meta.py
================================================
#!/usr/bin/env python3
"""
Task Meta Aggregator for MCPMark
Aggregates all meta.json files from the tasks directory into a single JSON file.
"""
import json
import os
import argparse
import subprocess
import shutil
from pathlib import Path
from typing import Dict, List, Any, Set
def find_all_meta_files(tasks_root: Path = Path("tasks")) -> List[Path]:
"""Find all meta.json files in the tasks directory"""
meta_files = []
for root, dirs, files in os.walk(tasks_root):
if "meta.json" in files:
meta_files.append(Path(root) / "meta.json")
return meta_files
def parse_meta_file(meta_path: Path) -> Dict[str, Any]:
"""Parse a single meta.json file"""
try:
with open(meta_path, "r", encoding="utf-8") as f:
return json.load(f)
except Exception as e:
print(f"Error parsing {meta_path}: {e}")
return {}
def aggregate_task_meta(meta_files: List[Path]) -> Dict[str, Any]:
"""Aggregate all meta.json files into the required structure"""
all_data = []
categories_dict = {} # Use dict to track unique categories
all_tags_set = set() # Set to collect all unique tags
for meta_path in meta_files:
meta_data = parse_meta_file(meta_path)
if meta_data:
# Exclude model_results field from aggregated data
filtered_data = {k: v for k, v in meta_data.items() if k != "model_results"}
all_data.append(filtered_data)
# Collect categories using category_id and category_name
if "category_id" in filtered_data and "category_name" in filtered_data:
category_id = filtered_data["category_id"]
category_name = filtered_data["category_name"]
# Use category_id as the key to ensure uniqueness
categories_dict[category_id] = {
"id": category_id,
"name": category_name,
}
# Collect all unique tags
if "tags" in filtered_data and isinstance(filtered_data["tags"], list):
all_tags_set.update(filtered_data["tags"])
# Convert categories dict to sorted list
categories_list = sorted(categories_dict.values(), key=lambda x: x["id"])
# Convert tags set to sorted list
all_tags_list = sorted(all_tags_set)
return {
"data": all_data,
"count": len(all_data),
"categories": categories_list,
"tags": all_tags_list,
}
def create_individual_task_files(meta_files: List[Path]) -> List[Dict[str, Any]]:
"""Create individual task JSON files with instruction and verify content"""
task_files = []
for meta_path in meta_files:
meta_data = parse_meta_file(meta_path)
if not meta_data or "task_id" not in meta_data:
continue
# Get the task directory
task_dir = meta_path.parent
# Read description.md if exists
description_path = task_dir / "description.md"
instruction_content = ""
if description_path.exists():
try:
with open(description_path, "r", encoding="utf-8") as f:
instruction_content = f.read()
except Exception as e:
print(f"Warning: Could not read {description_path}: {e}")
# Read verify.py if exists
verify_path = task_dir / "verify.py"
verify_content = ""
if verify_path.exists():
try:
with open(verify_path, "r", encoding="utf-8") as f:
verify_content = f.read()
except Exception as e:
print(f"Warning: Could not read {verify_path}: {e}")
# Create combined task data, excluding model_results
task_data = {
k: v for k, v in meta_data.items() if k != "model_results"
}
task_data["instruction"] = instruction_content
task_data["verify"] = verify_content
task_files.append({"filename": f"{meta_data['task_id']}.json", "data": task_data})
return task_files
def push_to_file(
output_file: Path,
data: Dict[str, Any],
task_files: List[Dict[str, Any]] = None,
push_to_repo: bool = False,
) -> bool:
"""Save the aggregated data to file and optionally push to repo"""
try:
# Create parent directory if it doesn't exist
output_file.parent.mkdir(parents=True, exist_ok=True)
# Write the aggregated data
with open(output_file, "w", encoding="utf-8") as f:
json.dump(data, f, indent=2, ensure_ascii=False)
print(f"✅ Task meta data saved to: {output_file}")
print(f"📊 Summary:")
print(f" - Total tasks with meta.json: {data['count']}")
print(f" - Categories: {len(data['categories'])}")
print(f" - Unique tags: {len(data['tags'])}")
if push_to_repo:
return push_to_experiments_repo(output_file, task_files)
return True
except Exception as e:
print(f"❌ Error saving file: {e}")
return False
def push_to_experiments_repo(
file_path: Path, task_files: List[Dict[str, Any]] = None
) -> bool:
"""Push the task meta file and individual task files to eval-sys/mcpmark-experiments repo"""
if not file_path.exists():
print("⚠️ File does not exist")
return False
repo_url = "https://github.com/eval-sys/mcpmark-experiments.git"
    temp_dir = Path("./temp_experiments_repo")
    # Capture the original working directory up front so the finally block can
    # always restore it, even if cloning fails before we chdir into the repo.
    original_dir = os.getcwd()
    try:
print(f"\n🔄 Preparing to push task meta to experiments repo...")
# Clean up any existing temp directory
if temp_dir.exists():
shutil.rmtree(temp_dir)
# Clone the repo
print("📥 Cloning experiments repo...")
subprocess.run(
["git", "clone", repo_url, str(temp_dir)], check=True, capture_output=True
)
# Copy the main task_meta.json file
target_path = temp_dir / "task_meta.json"
print(f"📁 Copying task meta file: task_meta.json")
shutil.copy2(file_path, target_path)
# Create tasks directory and copy individual task files
if task_files:
tasks_dir = temp_dir / "tasks"
tasks_dir.mkdir(exist_ok=True)
print(f"📁 Creating individual task files in ./tasks directory...")
for task_file in task_files:
task_file_path = tasks_dir / task_file["filename"]
with open(task_file_path, "w", encoding="utf-8") as f:
json.dump(task_file["data"], f, indent=2, ensure_ascii=False)
print(f" - Created {len(task_files)} individual task files")
# Change to repo directory for git operations
os.chdir(temp_dir)
# Add all changes
subprocess.run(["git", "add", "."], check=True)
# Check if there are changes to commit
result = subprocess.run(
["git", "status", "--porcelain"], capture_output=True, text=True
)
if not result.stdout.strip():
print("✅ No changes to push (files are up to date)")
return True
# Commit changes
commit_msg = "Update task meta data and individual task files"
subprocess.run(["git", "commit", "-m", commit_msg], check=True)
# Push changes
print("🚀 Pushing to remote repository...")
subprocess.run(["git", "push"], check=True)
print("✅ Successfully pushed task meta and individual task files to repo!")
return True
except subprocess.CalledProcessError as e:
print(f"❌ Git operation failed: {e}")
return False
except Exception as e:
print(f"❌ Error pushing to repo: {e}")
return False
finally:
# Change back to original directory
os.chdir(original_dir)
# Clean up temp directory
if temp_dir.exists():
shutil.rmtree(temp_dir)
def main():
parser = argparse.ArgumentParser(description="Aggregate all task meta.json files")
parser.add_argument(
"--output",
type=str,
default="task_meta.json",
help="Output file path (default: task_meta.json)",
)
parser.add_argument(
"--push",
action="store_true",
help="Push results to eval-sys/mcpmark-experiments repo",
)
args = parser.parse_args()
print("🔍 Searching for meta.json files in tasks directory...")
# Find all meta.json files
meta_files = find_all_meta_files()
if not meta_files:
print("❌ No meta.json files found in tasks directory")
return 1
print(f"📁 Found {len(meta_files)} meta.json files")
# Aggregate the data
print("🔄 Aggregating task meta data...")
aggregated_data = aggregate_task_meta(meta_files)
# Create individual task files if pushing to repo
task_files = None
if args.push:
print("🔄 Creating individual task files...")
task_files = create_individual_task_files(meta_files)
print(f"📝 Prepared {len(task_files)} individual task files")
# Save to file
output_path = Path(args.output)
success = push_to_file(output_path, aggregated_data, task_files, args.push)
if not success:
return 1
if args.push:
print(
f"🚀 Task meta data and individual task files pushed to eval-sys/mcpmark-experiments repo"
)
return 0
if __name__ == "__main__":
exit(main())
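# Illustrative sketch of the aggregated output (not part of the original module).
# Given the meta.json fields used above, task_meta.json roughly has the shape:
#
#   {
#     "data": [ { ...meta.json contents minus "model_results"... }, ... ],
#     "count": <number of tasks>,
#     "categories": [ {"id": "<category_id>", "name": "<category_name>"}, ... ],
#     "tags": ["<tag>", ...]
#   }
#
# With --push, each task is additionally written to tasks/<task_id>.json with
# "instruction" (description.md) and "verify" (verify.py) fields added.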
================================================
FILE: src/aggregators/pricing.py
================================================
"""
Pricing utilities for computing per-run cost from token usage.
All prices are specified per 1,000,000 tokens (M tokens) in USD.
"""
from __future__ import annotations
from typing import Dict, Optional
# Price map keyed by canonical model name (lowercased)
# Values are dicts with per-M token prices for input and output tokens
MODEL_PRICES_PER_M: Dict[str, Dict[str, float]] = {
# Use exact actual_model_name keys (lowercased) provided by the user
# Anthropic
"claude-opus-4-1-20250805": {"input": 15.0, "output": 75.0},
"claude-opus-4-5-20251101": {"input": 5.0, "output": 25.0},
"claude-sonnet-4-20250514": {"input": 3.0, "output": 15.0},
"claude-sonnet-4-5-20250929": {"input": 3.0, "output": 15.0},
# DeepSeek
"deepseek-v3.1-non-think": {"input": 0.56, "output": 1.68},
"deepseek-v3.2-chat": {"input": 0.27, "output": 0.40},
"deepseek-v3.2-reasoner": {"input": 0.27, "output": 0.40},
"deepseek-v3.1-terminus-thinking": {"input": 0.21, "output": 0.79},
"deepseek-v3.1-terminus": {"input": 0.21, "output": 0.79},
# Google Gemini
"gemini-2.5-pro": {"input": 2.5, "output": 15.0},
"gemini-2.5-flash": {"input": 0.3, "output": 2.5},
"gemini-3-pro": {"input": 2.0, "output": 12.0},
# Z.AI
"glm-4.5": {"input": 0.33, "output": 1.32},
# OpenAI
"gpt-5-2025-08-07": {"input": 1.25, "output": 10.0},
"gpt-5.2-2025-12-11": {"input": 1.75, "output": 14.0},
"gpt-5-mini-2025-08-07": {"input": 0.25, "output": 2.0},
"gpt-5-nano-2025-08-07": {"input": 0.05, "output": 0.4},
"gpt-4.1-2025-04-14": {"input": 2.0, "output": 8.0},
"gpt-4.1-mini-2025-04-14": {"input": 0.4, "output": 1.6},
"gpt-4.1-nano-2025-04-14": {"input": 0.1, "output": 0.4},
"o3-2025-04-16": {"input": 2.0, "output": 8.0},
"o4-mini-2025-04-16": {"input": 1.1, "output": 4.4},
"gpt-oss-120b": {"input": 0.072, "output": 0.28},
# Qwen
"qwen3-coder-480b-a35b-instruct": {"input": 0.2, "output": 0.8},
"qwen3-max-preview": {"input": 1.2, "output": 6},
# Xai
"grok-4-0709": {"input": 3.0, "output": 15.0},
"grok-code-fast-1": {"input": 0.2, "output": 1.5},
"grok-4-fast": {"input": 0.2, "output": 0.5},
# Moonshot
"kimi-k2-0711-preview": {"input": 0.6, "output": 2.5},
"kimi-k2-0905-preview": {"input": 0.6, "output": 2.5},
}
def normalize_model_name(model_name: str) -> str:
"""Normalize model name for pricing lookup.
    Strips surrounding whitespace and lowercases.
"""
return (model_name or "").strip().lower()
def get_price_per_m(model_name: str) -> Optional[Dict[str, float]]:
"""Return per-M token prices for given model, or None if unknown."""
key = normalize_model_name(model_name)
return MODEL_PRICES_PER_M.get(key)
def compute_cost_usd(model_name: str, input_tokens: float, output_tokens: float) -> Optional[float]:
"""Compute cost in USD given token usage and model pricing.
Prices are per 1,000,000 tokens. If pricing unknown, returns None.
"""
prices = get_price_per_m(model_name)
if not prices:
return None
input_cost = (input_tokens / 1_000_000.0) * prices["input"]
output_cost = (output_tokens / 1_000_000.0) * prices["output"]
return float(round(input_cost + output_cost, 6))
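# Illustrative usage (a sketch, not part of the original module). Prices are per
# 1M tokens, so 120k input and 30k output tokens on "gpt-5-mini-2025-08-07"
# ($0.25 / $2.0 per M) cost about 0.12 * 0.25 + 0.03 * 2.0 = 0.09 USD:
#
#   cost = compute_cost_usd("gpt-5-mini-2025-08-07", 120_000, 30_000)   # ~0.09
#   unknown = compute_cost_usd("some-unlisted-model", 1_000, 1_000)     # None (hypothetical name)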
================================================
FILE: src/base/__init__.py
================================================
================================================
FILE: src/base/login_helper.py
================================================
from abc import ABC, abstractmethod
class BaseLoginHelper(ABC):
"""Abstract base class for login helpers."""
def __init__(self):
pass
@abstractmethod
def login(self, **kwargs):
pass
================================================
FILE: src/base/state_manager.py
================================================
import time
from abc import ABC, abstractmethod
from dataclasses import dataclass
from typing import Any, Dict, List, Optional
from src.logger import get_logger
from .task_manager import BaseTask
# Initialize logger
logger = get_logger(__name__)
@dataclass
class InitialStateInfo:
"""Information about created initial state for a task."""
state_id: str
state_url: Optional[str] = None
metadata: Optional[Dict[str, Any]] = None
class BaseStateManager(ABC):
"""
Simplified abstract base class for state management in MCP services.
This class provides essential functionality for initial state creation and cleanup
while allowing service-specific implementations through template methods.
"""
def __init__(self, service_name: str):
self.service_name = service_name
# Simple resource tracking for cleanup
self.tracked_resources: List[Dict[str, Any]] = []
# Note: Initialization is now handled in service-specific constructors
def set_up(self, task: BaseTask) -> bool:
"""Set up initial state for a specific task.
Args:
task: The task for which to set up the initial state
Returns:
True if setup successful, False otherwise
"""
try:
logger.info(
f"| Setting up initial state for {self.service_name} task: {task.name}"
)
# Create initial state
initial_state_info = self._create_initial_state(task)
if not initial_state_info:
logger.error(f"| Failed to create initial state for {task.name}")
return False
# Store initial state info in task
self._store_initial_state_info(task, initial_state_info)
logger.info(f"| ✓ Initial state setup completed for {task.name}")
return True
except Exception as e:
logger.error(f"| Setup failed for {task.name}: {e}")
return False
def clean_up(self, task: BaseTask = None) -> bool:
"""Clean up resources with common patterns and service-specific hooks.
Args:
task: Optional task to clean up specific resources for
Returns:
True if cleanup successful, False otherwise
"""
try:
cleanup_success = True
# Task-specific cleanup
if task:
logger.info(
f"| ○ Cleaning up initial state for {self.service_name} task: {task.name}"
)
if not self._cleanup_task_initial_state(task):
cleanup_success = False
# Clean up all tracked resources
if not self._cleanup_tracked_resources():
cleanup_success = False
if cleanup_success:
logger.info(f"| ✓ Cleanup completed for {self.service_name}")
else:
logger.warning(
f"| Cleanup completed with some failures for {self.service_name}"
)
return cleanup_success
except Exception as e:
logger.error(f"Cleanup failed for {self.service_name}: {e}")
return False
def track_resource(
self,
resource_type: str,
identifier: str,
metadata: Optional[Dict[str, Any]] = None,
) -> None:
"""Track a resource for later cleanup.
Args:
resource_type: Type of resource (e.g., 'repository', 'page')
identifier: Unique identifier for the resource
metadata: Additional metadata about the resource
"""
resource = {
"type": resource_type,
"id": identifier,
"created_at": time.time(),
"metadata": metadata or {},
}
self.tracked_resources.append(resource)
logger.debug(f"Tracked {resource_type} resource: {identifier}")
def get_service_config_for_agent(self) -> dict:
"""
Get service-specific configuration for agent execution.
This method should be overridden by service implementations that need
to provide additional configuration to the agent.
Returns:
Dictionary containing configuration needed by the agent/MCP server
"""
return {}
def set_verification_environment(self, messages_path: str = None) -> None:
"""
Set environment variables needed for verification scripts.
Args:
messages_path: Optional path to messages.json file for verification
This method can be overridden by service implementations that need
to set specific environment variables for their verification scripts.
The default implementation sets MCP_MESSAGES if provided.
"""
import os
if messages_path:
os.environ["MCP_MESSAGES"] = str(messages_path)
def _cleanup_tracked_resources(self) -> bool:
"""Clean up all tracked resources."""
cleanup_success = True
for resource in self.tracked_resources:
try:
if not self._cleanup_single_resource(resource):
cleanup_success = False
except Exception as e:
logger.error(f"Failed to cleanup resource {resource}: {e}")
cleanup_success = False
# Clear resources after cleanup attempt
self.tracked_resources.clear()
return cleanup_success
# =========================================================================
# Abstract methods for service-specific behavior (simplified)
# =========================================================================
# Note: Service-specific initialization is now handled in constructors
@abstractmethod
def _create_initial_state(self, task: BaseTask) -> Optional[InitialStateInfo]:
"""Create initial state for a task (e.g., duplicate page, fork repo).
Args:
task: Task for which to create initial state
Returns:
InitialStateInfo object or None if creation failed
"""
pass
@abstractmethod
def _store_initial_state_info(
self, task: BaseTask, state_info: InitialStateInfo
) -> None:
"""Store initial state information in the task object.
Args:
task: Task object to update
state_info: Initial state information to store
"""
pass
@abstractmethod
def _cleanup_task_initial_state(self, task: BaseTask) -> bool:
"""Clean up initial state for a specific task.
Args:
task: Task whose initial state should be cleaned up
Returns:
True if cleanup successful, False otherwise
"""
pass
@abstractmethod
def _cleanup_single_resource(self, resource: Dict[str, Any]) -> bool:
"""Clean up a single tracked resource.
Args:
resource: Resource dictionary with type, id, and metadata
Returns:
True if cleanup successful, False otherwise
"""
pass
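# Minimal sketch of a concrete state manager (illustrative only, not part of the
# original module; the real implementations live under src/mcp_services/*).
# A hypothetical subclass only needs the four hooks above, e.g.:
#
#   class DummyStateManager(BaseStateManager):
#       def _create_initial_state(self, task):
#           return InitialStateInfo(state_id=f"dummy-{task.name}")
#       def _store_initial_state_info(self, task, state_info):
#           task.state_id = state_info.state_id
#       def _cleanup_task_initial_state(self, task):
#           return True
#       def _cleanup_single_resource(self, resource):
#           return True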
================================================
FILE: src/base/task_manager.py
================================================
#!/usr/bin/env python3
"""
Enhanced Base Task Manager with Common Task Discovery Logic
===========================================================
This module provides an improved base class for task managers that consolidates
common task discovery patterns while maintaining flexibility for service-specific needs.
"""
import json
import subprocess
import sys
from abc import ABC
from dataclasses import dataclass
from pathlib import Path
from typing import Any, Dict, List, Optional
from src.logger import get_logger
from src.results_reporter import TaskResult
logger = get_logger(__name__)
@dataclass
class BaseTask:
"""Base class for evaluation tasks."""
task_instruction_path: Path
task_verification_path: Path
service: str
category_id: str # From meta.json if available, otherwise directory name
task_id: str # From meta.json if available, otherwise directory name
@property
def name(self) -> str:
"""Return the task name using '__' separator format: 'category_id__task_id'."""
return f"{self.category_id}__{self.task_id}"
def get_task_instruction(self) -> str:
"""Return the full text content of the task instruction file."""
if not self.task_instruction_path.exists():
raise FileNotFoundError(
f"Task instruction file not found: {self.task_instruction_path}"
)
return self.task_instruction_path.read_text(encoding="utf-8")
class BaseTaskManager(ABC):
"""Enhanced base class for service-specific task managers with common discovery logic."""
def __init__(
self,
tasks_root: Path,
mcp_service: str = None,
task_class: type = None,
task_organization: str = None,
task_suite: str | None = "standard",
):
"""Initialize the base task manager.
Args:
tasks_root: Root directory containing all tasks
mcp_service: MCP service name (e.g., 'notion', 'github', 'filesystem')
task_class: Custom task class to use (defaults to BaseTask)
task_organization: 'file' or 'directory' based task organization
task_suite: Logical task suite (e.g., 'standard', 'easy')
"""
self.tasks_root = tasks_root
self.mcp_service = mcp_service or self.__class__.__name__.lower().replace(
"taskmanager", ""
)
self.task_class = task_class or BaseTask
self.task_organization = task_organization
self.task_suite = task_suite
self._tasks_cache = None
# =========================================================================
# Common Task Discovery Implementation
# =========================================================================
def discover_all_tasks(self) -> List[BaseTask]:
"""Discover all available tasks for this service (common implementation)."""
if self._tasks_cache is not None:
return self._tasks_cache
tasks = []
service_dir = self.tasks_root / (
self.mcp_service or self._get_service_directory_name()
)
if self.task_suite:
service_dir = service_dir / self.task_suite
if not service_dir.exists():
logger.warning(
f"{self.mcp_service.title()} tasks directory does not exist: {service_dir}"
)
return tasks
# Scan categories
for category_dir in service_dir.iterdir():
if not self._is_valid_category_dir(category_dir):
continue
category_id = category_dir.name
logger.info("Discovering tasks in category: %s", category_id)
# Find tasks using service-specific logic
task_files = self._find_task_files(category_dir)
for task_files_info in task_files:
task = self._create_task_from_files(category_id, task_files_info)
if task:
tasks.append(task)
logger.debug("Found task: %s", task.name)
# Sort and cache
# Sort by category_id and a stringified task_id to handle both numeric IDs and slugs uniformly
self._tasks_cache = sorted(tasks, key=lambda t: (t.category_id, str(t.task_id)))
logger.info(
"Discovered %d %s tasks across all categories (suite=%s)",
len(self._tasks_cache),
self.mcp_service.title(),
self.task_suite or "default",
)
return self._tasks_cache
def get_categories(self) -> List[str]:
"""Get a list of all task categories (common implementation)."""
tasks = self.discover_all_tasks()
return sorted(list(set(task.category_id for task in tasks)))
def filter_tasks(self, task_filter: str) -> List[BaseTask]:
"""Filter tasks based on category or specific task pattern (common implementation)."""
all_tasks = self.discover_all_tasks()
if not task_filter or task_filter.lower() == "all":
return all_tasks
# Check if it's a category filter
categories = self.get_categories()
if task_filter in categories:
return [task for task in all_tasks if task.category_id == task_filter]
# Check for specific task pattern (category_id/task_id)
if "/" in task_filter:
try:
category, task_part = task_filter.split("/", 1)
# First try to match by task_id (could be numeric or string)
for task in all_tasks:
if task.category_id == category:
# Check if task_id matches (as string or as specific pattern)
if str(task.task_id) == task_part:
return [task]
except (ValueError, IndexError):
pass
# Fallback: check for partial matches in task names or categories
filtered_tasks = []
for task in all_tasks:
if (
task_filter in task.category_id
or task_filter in task.name
or task_filter == str(task.task_id)
):
filtered_tasks.append(task)
return filtered_tasks
# =========================================================================
# Common Helper Methods
# =========================================================================
def get_task_instruction(self, task: BaseTask) -> str:
"""Get formatted task instruction (template method)."""
base_instruction = self._read_task_instruction(task)
return self._format_task_instruction(base_instruction)
def execute_task(self, task: BaseTask, agent_result: Dict[str, Any]) -> TaskResult:
"""Execute task verification (template method)."""
logger.info(f"| Verifying task ({self.mcp_service.title()}): {task.name}")
# Track agent success separately
agent_success = agent_result.get("success", False)
agent_error = None
verification_success = False
verification_error = None
verification_output = None
# Handle agent failure (but still continue to verification)
if not agent_success:
agent_error = agent_result.get("error", "Agent execution failed")
# Standardize MCP network errors
agent_error = self._standardize_error_message(agent_error)
logger.error(f"| ✗ Agent execution failed for task")
logger.error(f"| ⚠️ Error: {agent_error}")
logger.info(f"| - Proceeding with verification despite agent failure")
try:
# Always run verification regardless of agent success
verify_result = self.run_verification(task)
# Process verification results
verification_success = verify_result.returncode == 0
verification_output = verify_result.stdout
# Log verification output
if verification_output:
print(verification_output)
# Capture verification error if failed
if not verification_success:
verification_error = verify_result.stderr if verify_result.stderr else "Verification failed with no error message"
if verification_success:
logger.info(f"| Verification Result: \033[92m✓ PASSED\033[0m")
else:
logger.error(f"| Verification Result: \033[91m✗ FAILED\033[0m")
if verification_error:
logger.error(f"| Verification Error: {verification_error}")
return TaskResult(
task_name=task.name,
success=verification_success,
error_message=agent_error, # Agent execution error
verification_error=verification_error, # Verification error
verification_output=verification_output, # Verification output
model_output=agent_result.get("output", ""),
category_id=task.category_id,
task_id=task.task_id,
token_usage=agent_result.get("token_usage", {}),
turn_count=agent_result.get("turn_count", -1),
)
except Exception as e:
logger.error(f"| Task verification failed: {e}", exc_info=True)
return TaskResult(
task_name=task.name,
success=False,
error_message=agent_error, # Keep agent error if any
verification_error=str(e), # Verification exception
verification_output=None,
category_id=task.category_id,
task_id=task.task_id,
model_output=agent_result.get("output", ""),
token_usage=agent_result.get("token_usage", {}),
turn_count=agent_result.get("turn_count", 0),
)
def run_verification(self, task: BaseTask) -> subprocess.CompletedProcess:
"""Run the verification script for a task (can be overridden).
Default implementation runs the verification command.
Services can override this to add environment variables or custom logic.
"""
return subprocess.run(
self._get_verification_command(task),
capture_output=True, # Capture stdout and stderr for logging
text=True,
timeout=300,
)
# =========================================================================
# Abstract Methods - Minimal Set Required
# =========================================================================
def _get_service_directory_name(self) -> str:
"""Return the service directory name (e.g., 'notion', 'github').
Default implementation uses the service parameter if provided.
"""
if self.mcp_service:
return self.mcp_service
raise NotImplementedError(
"Must provide service parameter or implement _get_service_directory_name"
)
def _get_task_organization(self) -> str:
"""Return task organization type: 'directory' or 'file'.
- 'directory': Tasks organized as task_X/description.md (Notion)
- 'file': Tasks organized as task_X.md (GitHub, Filesystem)
Default implementation uses the task_organization parameter if provided.
"""
if self.task_organization:
return self.task_organization
raise NotImplementedError(
"Must provide task_organization parameter or implement _get_task_organization"
)
# Note: _create_task_instance is no longer needed - use task_class parameter instead
# =========================================================================
# Hook Methods with Smart Defaults
# =========================================================================
def _is_valid_category_dir(self, category_dir: Path) -> bool:
"""Check if a directory is a valid category directory."""
return (
category_dir.is_dir()
and not category_dir.name.startswith(".")
and category_dir.name != "utils"
and category_dir.name != "__pycache__"
)
def _find_task_files(self, category_dir: Path) -> List[Dict[str, Any]]:
"""Find task files in a category directory (smart default implementation).
Automatically handles both directory-based and file-based organization.
"""
task_files: List[Dict[str, Any]] = []
for task_dir in category_dir.iterdir():
# Skip anything that is not a directory or is hidden
if not task_dir.is_dir() or task_dir.name.startswith("."):
continue
description_path = task_dir / "description.md"
verify_path = task_dir / "verify.py"
# We consider a directory a valid task only if the two mandatory files exist
if not (description_path.exists() and verify_path.exists()):
logger.warning(
"Skipping %s – missing description.md or verify.py", task_dir
)
continue
task_files.append(
{
"task_id": task_dir.name,
"instruction_path": description_path,
"verification_path": verify_path,
}
)
return task_files
def _create_task_from_files(
self, category_id: str, task_files_info: Dict[str, Any]
) -> Optional[BaseTask]:
"""Create a task from file information with meta.json support."""
# Check for meta.json
meta_path = task_files_info["instruction_path"].parent / "meta.json"
# Default to directory names
task_id = task_files_info["task_id"]
final_category_id = category_id
if meta_path.exists():
try:
with open(meta_path, 'r') as f:
meta_data = json.load(f)
# Use values from meta.json if available
final_category_id = meta_data.get("category_id", category_id)
task_id = meta_data.get("task_id", task_id)
except Exception as e:
logger.warning(f"Failed to load meta.json from {meta_path}: {e}")
return self.task_class(
task_instruction_path=task_files_info["instruction_path"],
task_verification_path=task_files_info["verification_path"],
service=self.mcp_service,
category_id=final_category_id,
task_id=task_id,
)
def _read_task_instruction(self, task: BaseTask) -> str:
"""Read and return the task instruction content."""
return task.get_task_instruction()
def _format_task_instruction(self, base_instruction: str) -> str:
"""Format task instruction with Notion-specific additions."""
return (
base_instruction
+ "\n\nNote: Based on your understanding, solve the task all at once by yourself, don't ask for my opinions on anything."
)
def _get_verification_command(self, task: BaseTask) -> List[str]:
"""Get the command to run task verification (default implementation)."""
return [sys.executable, str(task.task_verification_path)]
def _standardize_error_message(self, error_message: str) -> str:
"""Standardize error messages for consistent reporting."""
from src.errors import standardize_error_message
return standardize_error_message(error_message, mcp_service=self.mcp_service)
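# Illustrative note (not part of the original module): with the default discovery
# logic above, a task is any directory that contains both description.md and
# verify.py, e.g.
#
#   tasks/<service>/<suite>/<category>/<task>/
#       description.md   # instruction handed to the agent
#       verify.py        # run with sys.executable; exit code 0 means the task passed
#       meta.json        # optional; may override category_id / task_id
#
# and its canonical name is "<category_id>__<task_id>".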
================================================
FILE: src/config/__init__.py
================================================
================================================
FILE: src/config/config_schema.py
================================================
#!/usr/bin/env python3
"""
Centralized Configuration Schema for MCPMark
=============================================
This module provides a unified configuration system with validation,
type safety, and support for multiple configuration sources.
"""
import os
from abc import ABC, abstractmethod
from dataclasses import dataclass
from pathlib import Path
from typing import Any, Dict, Optional
import yaml
from dotenv import load_dotenv
from src.logger import get_logger
logger = get_logger(__name__)
# Lazy import to avoid circular dependencies
def get_service_definition(service_name: str) -> dict:
from src.services import get_service_definition as _get_service_def
return _get_service_def(service_name)
@dataclass
class ConfigValue:
"""Represents a configuration value with metadata."""
key: str
value: Any
source: str # 'env', 'file', 'default'
required: bool = True
description: str = ""
validator: Optional[callable] = None
def validate(self) -> bool:
"""Validate the configuration value."""
if self.required and self.value is None:
raise ValueError(f"Required configuration '{self.key}' is missing")
if self.validator and self.value is not None:
if not self.validator(self.value):
raise ValueError(f"Invalid value for '{self.key}': {self.value}")
return True
class ConfigSchema(ABC):
"""Abstract base class for service configuration schemas."""
def __init__(self, service_name: str):
self.service_name = service_name
self._values: Dict[str, ConfigValue] = {}
self._load_dotenv()
self._define_schema()
self._load_values()
self._validate()
@abstractmethod
def _define_schema(self) -> None:
"""Define the configuration schema for this service."""
pass
def _load_dotenv(self) -> None:
"""Load environment variables from .mcp_env file."""
load_dotenv(dotenv_path=".mcp_env", override=False)
def _add_config(
self,
key: str,
env_var: Optional[str] = None,
default: Any = None,
required: bool = True,
description: str = "",
validator: Optional[callable] = None,
transform: Optional[callable] = None,
) -> None:
"""Add a configuration value to the schema."""
# Try to get value from environment first
value = None
source = "default"
if env_var:
env_value = os.getenv(env_var)
if env_value is not None:
value = transform(env_value) if transform else env_value
source = "env"
# Use default if no environment value
if value is None and default is not None:
value = default
source = "default"
self._values[key] = ConfigValue(
key=key,
value=value,
source=source,
required=required,
description=description,
validator=validator,
)
def _load_values(self) -> None:
"""Load configuration values from file if available."""
config_file = Path(f"config/{self.service_name}.yaml")
if config_file.exists():
with open(config_file) as f:
file_config = yaml.safe_load(f)
for key, value in file_config.items():
if key in self._values and self._values[key].value is None:
self._values[key].value = value
self._values[key].source = "file"
def _validate(self) -> None:
"""Validate all configuration values."""
for config_value in self._values.values():
config_value.validate()
def get(self, key: str, default: Any = None) -> Any:
"""Get a configuration value."""
if key in self._values:
return self._values[key].value
return default
def get_all(self) -> Dict[str, Any]:
"""Get all configuration values as a dictionary."""
return {k: v.value for k, v in self._values.items()}
def get_debug_info(self) -> Dict[str, Dict[str, Any]]:
"""Get detailed configuration information for debugging."""
return {
k: {
"value": v.value,
"source": v.source,
"required": v.required,
"description": v.description,
}
for k, v in self._values.items()
}
class GenericConfigSchema(ConfigSchema):
"""Generic configuration schema that reads from service definitions."""
def __init__(self, service_name: str):
# Get service definition before calling parent init
self.service_definition = get_service_definition(service_name)
super().__init__(service_name)
def _define_schema(self) -> None:
"""Define schema from service definition."""
config_schema = self.service_definition.get("config_schema", {})
for key, config in config_schema.items():
# Handle transform strings
transform = None
transform_str = config.get("transform")
if transform_str == "bool":
transform = lambda x: x.lower() in ["true", "1", "yes"]
elif transform_str == "int":
transform = int
elif transform_str == "path":
transform = lambda x: Path(x) if x else None
elif transform_str == "list":
transform = lambda x: [t.strip() for t in x.split(",")] if x else []
# Handle validator strings
validator = None
validator_str = config.get("validator")
if validator_str == "port":
validator = lambda x: 1 <= x <= 65535
elif validator_str and validator_str.startswith("in:"):
valid_values = validator_str[3:].split(",")
validator = lambda x, values=valid_values: x in values
self._add_config(
key=key,
env_var=config.get("env_var"),
default=config.get("default"),
required=config.get("required", True),
description=config.get("description", ""),
validator=validator,
transform=transform,
)
# Configuration Registry
class ConfigRegistry:
"""Central registry for all service configurations."""
_instances: Dict[str, ConfigSchema] = {}
@classmethod
def get_config(cls, service_name: str) -> ConfigSchema:
"""Get or create configuration for a service."""
if service_name not in cls._instances:
cls._instances[service_name] = GenericConfigSchema(service_name)
return cls._instances[service_name]
@classmethod
def validate_all(cls) -> Dict[str, bool]:
"""Validate all registered configurations."""
from src.services import get_supported_mcp_services
results = {}
for service_name in get_supported_mcp_services():
try:
cls.get_config(service_name)
results[service_name] = True
except Exception as e:
logger.error(f"Configuration validation failed for {service_name}: {e}")
results[service_name] = False
return results
@classmethod
def export_template(cls, service_name: str, output_path: Path) -> None:
"""Export a configuration template for a service."""
config = cls.get_config(service_name)
template = {"service": service_name, "configuration": {}}
for key, config_value in config._values.items():
template["configuration"][key] = {
"value": config_value.value
if config_value.source == "default"
else None,
"description": config_value.description,
"required": config_value.required,
"env_var": f"${{{key.upper()}}}",
}
with open(output_path, "w") as f:
yaml.dump(template, f, default_flow_style=False, sort_keys=False)
# Utility Functions
def get_service_config(service_name: str) -> Dict[str, Any]:
"""Get service configuration as a dictionary."""
return ConfigRegistry.get_config(service_name).get_all()
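# Illustrative sketch (not part of the original module): a hypothetical entry in a
# service definition's "config_schema", as consumed by GenericConfigSchema above.
#
#   "config_schema": {
#       "host": {"env_var": "POSTGRES_HOST", "default": "localhost",
#                "required": True, "description": "Database host"},
#       "port": {"env_var": "POSTGRES_PORT", "transform": "int",
#                "validator": "port", "default": 5432},
#   }
#
# Values come from the environment variable first, falling back to the default;
# config/<service>.yaml only fills keys that are still unset. Downstream code can
# read the result via get_service_config("postgres") as a plain dict.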
================================================
FILE: src/errors.py
================================================
#!/usr/bin/env python3
"""
Simple Error Handling for MCPMark
==================================
Provides basic error standardization and retry logic.
"""
from typing import Optional
"""Retryable error detection via minimal substring matching (lower-case)."""
# Keep this list short and generic; aim to catch API/infrastructure issues only.
RETRYABLE_PATTERNS = {
"ratelimit", # e.g., RateLimitError, too many requests
# "connection", # connection refused/reset/error
"agent execution failed",
"unavailable", # service unavailable
# "execution timed out", # timeout
"internal server error", # 500s
"network error", # generic network issue
"quota", # budget/quota exceeded
# "llm provider not provided", # litellm error
# pipeline infra signals
"account balance",
"mcp network error",
"state duplication error",
"thought_signature",
"overloaded."
}
def is_retryable_error(error: str) -> bool:
"""Return True if the error string contains any retryable pattern."""
error_lower = str(error or "").lower()
return any(pattern in error_lower for pattern in RETRYABLE_PATTERNS)
def standardize_error_message(error: str, mcp_service: Optional[str] = None) -> str:
"""Standardize error messages for consistent reporting."""
error_str = str(error).strip()
# Common standardizations
if "timeout" in error_str.lower():
base_msg = "Operation timed out"
elif (
"connection refused" in error_str.lower() or "econnrefused" in error_str.lower()
):
base_msg = "Connection refused"
elif "not found" in error_str.lower():
base_msg = "Resource not found"
elif "already exists" in error_str.lower():
base_msg = "Resource already exists"
else:
# Return original message if no standardization applies
return error_str
# Add MCP service prefix if provided
if mcp_service:
return f"{mcp_service.title()} {base_msg}"
return base_msg
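# Illustrative behaviour (not part of the original module):
#
#   is_retryable_error("RateLimitError: too many requests")      # True  (matches "ratelimit")
#   is_retryable_error("verification failed: wrong page title")  # False
#   standardize_error_message("connection refused by host", "notion")
#   # -> "Notion Connection refused"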
================================================
FILE: src/evaluator.py
================================================
import time
import json
import shutil
from datetime import datetime
from pathlib import Path
from typing import List, Optional
from src.logger import get_logger
from src.factory import MCPServiceFactory
from src.model_config import ModelConfig
from src.results_reporter import EvaluationReport, ResultsReporter, TaskResult
from src.errors import is_retryable_error
from src.agents import AGENT_REGISTRY
# Initialize logger
logger = get_logger(__name__)
class MCPEvaluator:
def __init__(
self,
mcp_service: str,
model: str,
timeout: int = 300,
exp_name: str = "test-run",
output_dir: Path = None,
reasoning_effort: str = "default",
agent_name: str = "mcpmark",
task_suite: str = "standard",
compaction_token: int = 0,
):
# Main configuration
self.mcp_service = mcp_service
self.timeout = timeout
self.agent_name = (agent_name or "mcpmark").lower()
self.task_suite = (task_suite or "standard").lower()
if self.agent_name not in AGENT_REGISTRY:
raise ValueError(f"Unsupported agent '{agent_name}'. Available: {sorted(AGENT_REGISTRY)}")
# Initialize model configuration
self.reasoning_effort = reasoning_effort
self.model_name = model
model_config = ModelConfig(self.model_name)
self.api_key = model_config.api_key
self.base_url = model_config.base_url
self.litellm_input_model_name = model_config.litellm_input_model_name
# Track the actual model name from LiteLLM responses
self.litellm_run_model_name = None
# Initialize managers using the factory pattern (simplified)
self.task_manager = MCPServiceFactory.create_task_manager(
mcp_service, task_suite=self.task_suite
)
self.state_manager = MCPServiceFactory.create_state_manager(mcp_service)
# Obtain static service configuration from state manager (e.g., notion_key)
self.service_config = self.state_manager.get_service_config_for_agent()
# Initialize agent for LLM and MCP server management. The agent will
# automatically refresh its service configuration from the state
# manager before each execution, so per-task manual updates are no
# longer needed.
agent_cls = AGENT_REGISTRY[self.agent_name]
self.agent = agent_cls(
litellm_input_model_name=self.litellm_input_model_name,
api_key=self.api_key,
base_url=self.base_url,
mcp_service=mcp_service,
timeout=timeout,
service_config=self.service_config,
service_config_provider=self.state_manager.get_service_config_for_agent,
reasoning_effort=self.reasoning_effort,
compaction_token=compaction_token,
)
# Initialize results reporter
self.results_reporter = ResultsReporter()
# Output directory handling
if self.reasoning_effort != "default":
model_slug = self.model_name.replace(".", "-") + "-" + self.reasoning_effort
else:
model_slug = self.model_name.replace(".", "-")
service_for_dir = "playwright" if mcp_service == "playwright_webarena" else mcp_service
suite_suffix = "" if self.task_suite in ("standard", "", None) else f"-{self.task_suite}"
service_dir_name = f"{service_for_dir}{suite_suffix}"
self.base_experiment_dir = output_dir / f"{model_slug}__{service_dir_name}" / exp_name
self.base_experiment_dir.mkdir(parents=True, exist_ok=True)
def _format_duration(self, seconds: float) -> str:
"""Format duration: <1s as ms, otherwise seconds."""
return f"{(seconds * 1000):.2f}ms" if seconds < 1 else f"{seconds:.2f}s"
def _get_task_output_dir(self, task) -> Path:
"""Return the directory path for storing this task's reports using '__' separator."""
# Use category_id and task_id with '__' separator
category_id = task.category_id if task.category_id else "uncategorized"
task_id = str(task.task_id)
return self.base_experiment_dir / f"{category_id}__{task_id}"
# ------------------------------------------------------------------
# Resuming helpers
# ------------------------------------------------------------------
def _load_latest_task_result(self, task) -> Optional[TaskResult]:
"""Return the most recent TaskResult for *task* if it has been run before."""
task_dir = self._get_task_output_dir(task)
if not task_dir.exists():
return None
meta_path = task_dir / "meta.json"
if not meta_path.exists():
return None
try:
with meta_path.open("r", encoding="utf-8") as f:
meta_data = json.load(f)
return TaskResult(
task_name=meta_data["task_name"],
success=meta_data["execution_result"]["success"],
error_message=meta_data["execution_result"].get("error_message"),
verification_error=meta_data["execution_result"].get("verification_error"),
verification_output=meta_data["execution_result"].get("verification_output"),
category_id=task.category_id,
task_id=task.task_id,
model_output=None,
token_usage=meta_data.get("token_usage", {}),
turn_count=meta_data.get("turn_count"),
agent_execution_time=meta_data.get("agent_execution_time", 0.0),
task_execution_time=meta_data.get("task_execution_time", 0.0),
)
except Exception as exc:
logger.warning("Failed to load existing result for %s: %s", task.name, exc)
return None
def _gather_all_task_results(self) -> List[TaskResult]:
"""Scan *all* task sub-directories and collect the latest TaskResult from each."""
results: list[TaskResult] = []
if not self.base_experiment_dir.exists():
return results
for task_dir in self.base_experiment_dir.iterdir():
if not task_dir.is_dir():
continue
meta_path = task_dir / "meta.json"
if not meta_path.exists():
continue
try:
with meta_path.open("r", encoding="utf-8") as f:
meta_data = json.load(f)
category_id, task_id = task_dir.name.split("__", 1)
result = TaskResult(
task_name=meta_data["task_name"],
success=meta_data["execution_result"]["success"],
error_message=meta_data["execution_result"].get("error_message"),
verification_error=meta_data["execution_result"].get("verification_error"),
verification_output=meta_data["execution_result"].get("verification_output"),
category_id=category_id,
task_id=task_id,
model_output=None,
token_usage=meta_data.get("token_usage", {}),
turn_count=meta_data.get("turn_count"),
agent_execution_time=meta_data.get("agent_execution_time", 0.0),
task_execution_time=meta_data.get("task_execution_time", 0.0),
)
results.append(result)
except Exception as exc:
logger.warning(
"Failed to parse existing report in %s: %s", task_dir, exc
)
return results
def _run_single_task(self, task) -> TaskResult:
"""
Runs a single task, including setup, agent execution, verification, and cleanup.
"""
# Track overall task start time
task_start_time = time.time()
# ------------------------------------------------------------------
# Stage 1: Set up the initial state for the task
# ------------------------------------------------------------------
setup_start_time = time.time()
logger.info(
"\n┌─ Stage 1: Setup ─────────────────────────────────────────────────────"
)
setup_success = self.state_manager.set_up(task)
setup_time = time.time() - setup_start_time
if not setup_success:
logger.error(f"| State setup failed for task: {task.name}")
task_total_time = time.time() - task_start_time
return TaskResult(
task_name=task.name,
success=False,
error_message="State Duplication Error",
verification_error=None,
verification_output=None,
category_id=task.category_id,
task_id=task.task_id,
agent_execution_time=0.0,
task_execution_time=task_total_time,
)
display_time = self._format_duration(setup_time)
logger.info(f"└─ Completed in {display_time}\n")
# ------------------------------------------------------------------
# Stage 2: Execute the task using the agent
# ------------------------------------------------------------------
logger.info(
"┌─ Stage 2: Execute ───────────────────────────────────────────────────"
)
agent_execution_start_time = time.time()
# Get task instruction from task manager
task_instruction = self.task_manager.get_task_instruction(task)
# Prepare task_output_dir and tool call log file
task_output_dir = self._get_task_output_dir(task)
task_output_dir.mkdir(parents=True, exist_ok=True)
execution_log_path = task_output_dir / "execution.log"
# Remove existing execution.log to ensure clean start
if execution_log_path.exists():
execution_log_path.unlink()
# Execute with agent
agent_result = self.agent.execute_sync(
task_instruction, str(execution_log_path)
)
agent_execution_time = time.time() - agent_execution_start_time
# Extract actual model name from LiteLLM response
if agent_result.get("litellm_run_model_name"):
self.litellm_run_model_name = agent_result["litellm_run_model_name"]
# Write messages.json to task_output_dir
messages_path = task_output_dir / "messages.json"
self.results_reporter.save_messages_json(
agent_result.get("output", []), messages_path
)
# Set service-specific environment variables for verification scripts
self.state_manager.set_verification_environment(str(messages_path))
logger.info(f"└─ Completed in {self._format_duration(agent_execution_time)}\n")
# ------------------------------------------------------------------
# Stage 3: Verify
# ------------------------------------------------------------------
logger.info(
"┌─ Stage 3: Verify ────────────────────────────────────────────────────"
)
verify_start_time = time.time()
try:
result = self.task_manager.execute_task(task, agent_result)
finally:
# Clean up environment variables
import os
os.environ.pop("MCP_MESSAGES", None)
os.environ.pop("MCP_GITHUB_TOKEN", None)
verify_time = time.time() - verify_start_time
logger.info(f"└─ Completed in {self._format_duration(verify_time)}\n")
# ------------------------------------------------------------------
# Stage 4: Clean up
# ------------------------------------------------------------------
logger.info(
"┌─ Stage 4: Cleanup ───────────────────────────────────────────────────"
)
cleanup_start_time = time.time()
self.state_manager.clean_up(task)
cleanup_time = time.time() - cleanup_start_time
logger.info(f"└─ Completed in {self._format_duration(cleanup_time)}\n")
# Calculate total task execution time
task_total_time = time.time() - task_start_time
# Add timing information to the result
result.agent_execution_time = agent_execution_time
result.task_execution_time = task_total_time
return result
def run_evaluation(self, task_filter: str) -> EvaluationReport:
"""
Runs the full evaluation for the specified tasks.
"""
tasks = self.task_manager.filter_tasks(task_filter)
results = []
for task in tasks:
# --------------------------------------------------------------
# Resume check
# --------------------------------------------------------------
existing_result = self._load_latest_task_result(task)
# Decide whether to skip or retry this task
retry_due_to_error = (
existing_result is not None
and not existing_result.success
and is_retryable_error(existing_result.error_message)
)
if existing_result and not retry_due_to_error:
# Existing result is either successful or failed with a non-retryable error – skip.
logger.info(
"↩️ Skipping already-completed task (resume): %s", task.name
)
results.append(existing_result)
continue
if retry_due_to_error:
# Clean previous artifacts so that new results fully replace them.
task_output_dir = self._get_task_output_dir(task)
if task_output_dir.exists():
shutil.rmtree(task_output_dir)
logger.info(
"🔄 Retrying task due to pipeline error (%s): %s",
existing_result.error_message,
task.name,
)
# --------------------------------------------------------------
# Execute new task
# --------------------------------------------------------------
task_start = time.time()
task_result = self._run_single_task(task)
task_end = time.time()
results.append(task_result)
# Prepare directory & save
task_output_dir = self._get_task_output_dir(task)
task_output_dir.mkdir(parents=True, exist_ok=True)
# Save messages.json (conversation trajectory)
messages_path = task_output_dir / "messages.json"
            if not messages_path.exists():  # skip if already written
messages = (
task_result.model_output
if getattr(task_result, "model_output", None)
else []
)
self.results_reporter.save_messages_json(messages, messages_path)
# Save meta.json (all other metadata)
meta_path = task_output_dir / "meta.json"
model_config = {
"mcp_service": self.mcp_service,
"model_name": self.model_name,
"litellm_run_model_name": self.litellm_run_model_name,
"reasoning_effort": self.reasoning_effort,
"timeout": self.timeout,
"agent_name": self.agent_name,
}
self.results_reporter.save_meta_json(
task_result,
model_config,
datetime.fromtimestamp(task_start),
datetime.fromtimestamp(task_end),
meta_path,
)
# --------------------------------------------------------------
# Aggregate results – combine current `results` with any previously
# saved TaskResults that ALSO match the current task_filter.
# --------------------------------------------------------------
# Helper: determine if a TaskResult matches the filter string
def _matches_filter(tr: TaskResult, flt: str) -> bool:
if flt.lower() == "all":
return True
if "/" in flt:
# specific task (category_id/task_id)
category_id, task_id = flt.split("/", 1)
return tr.category_id == category_id and str(tr.task_id) == task_id
# category level
return tr.category_id == flt
# Pull existing reports from disk and merge
existing_results = [
r
for r in self._gather_all_task_results()
if _matches_filter(r, task_filter)
]
# Merge, giving preference to fresh `results` (avoids duplicates)
merged: dict[str, TaskResult] = {r.task_name: r for r in existing_results}
merged.update({r.task_name: r for r in results}) # overwrite with latest run
final_results = list(merged.values())
aggregated_report = EvaluationReport(
model_name=self.model_name,
model_config={
"mcp_service": self.mcp_service,
"model_name": self.model_name,
"litellm_run_model_name": self.litellm_run_model_name,
"reasoning_effort": self.reasoning_effort,
"timeout": self.timeout,
"agent_name": self.agent_name,
},
total_tasks=len(final_results),
successful_tasks=sum(1 for r in final_results if r.success),
failed_tasks=sum(1 for r in final_results if not r.success),
task_results=final_results,
tasks_filter=task_filter,
)
# Save model-level summary
summary_path = self.base_experiment_dir / "summary.json"
self.results_reporter.save_model_summary(aggregated_report, summary_path)
logger.info(
"\n============================================================"
"\nResults Summary"
"\n============================================================"
)
logger.info(
f"✓ Tasks passed: {aggregated_report.successful_tasks}/{aggregated_report.total_tasks} ({aggregated_report.success_rate:.1f}%)"
)
logger.info(f"⏱ Total time: {aggregated_report.total_task_execution_time:.1f}s")
return aggregated_report
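# Filter grammar sketch (ids below are illustrative, taken from the bundled
# filesystem tasks; `pipeline` stands for an already-constructed evaluation
# pipeline instance):
#
#     pipeline.run_evaluation("all")                         # every task
#     pipeline.run_evaluation("file_property")               # one category
#     pipeline.run_evaluation("file_property/txt_merging")   # one specific task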
================================================
FILE: src/factory.py
================================================
#!/usr/bin/env python3
"""
MCP Service Factory for MCPMark
=================================
This module provides a simplified factory pattern for creating service-specific managers
with centralized configuration management.
Features:
- Dynamic service loading from definitions
- Centralized configuration
- Simplified service registration
"""
import importlib
from dataclasses import dataclass
from typing import Dict, Type
from src.base.login_helper import BaseLoginHelper
from src.base.state_manager import BaseStateManager
from src.base.task_manager import BaseTaskManager
from src.config.config_schema import ConfigRegistry
from src.services import get_service_definition, get_supported_mcp_services
@dataclass
class ServiceComponents:
"""All components required for an MCP service."""
task_manager_class: Type[BaseTaskManager]
state_manager_class: Type[BaseStateManager]
login_helper_class: Type[BaseLoginHelper]
config_mapping: Dict[str, Dict[str, str]]
def import_class(module_path: str):
"""Dynamically import a class from module path string."""
if not module_path:
return None
module_name, class_name = module_path.rsplit(".", 1)
module = importlib.import_module(module_name)
return getattr(module, class_name)
def apply_config_mapping(config: dict, mapping: dict) -> dict:
"""Apply config mapping to transform config keys to constructor params."""
if not mapping:
return {}
result = {}
for param_name, config_key in mapping.items():
if config_key in config:
result[param_name] = config[config_key]
return result
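# Worked example (keys are illustrative): with mapping
# {"test_root": "filesystem_test_root"} and config
# {"filesystem_test_root": "./test_environments"}, the result is
# {"test_root": "./test_environments"}; mapping entries whose config key is
# absent from `config` are simply skipped.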
class ServiceRegistry:
"""Central registry that loads MCP services from definitions."""
# Cache for loaded components
_components_cache: Dict[str, ServiceComponents] = {}
@classmethod
def get_components(cls, service_name: str) -> ServiceComponents:
"""Get MCP service components from definition."""
if service_name in cls._components_cache:
return cls._components_cache[service_name]
definition = get_service_definition(service_name)
# Import classes dynamically
components = ServiceComponents(
task_manager_class=import_class(definition["components"]["task_manager"]),
state_manager_class=import_class(definition["components"]["state_manager"]),
login_helper_class=import_class(definition["components"]["login_helper"]),
config_mapping=definition.get("config_mapping", {}),
)
cls._components_cache[service_name] = components
return components
class GenericServiceFactory:
"""Generic factory that works with any MCP service."""
def __init__(self, components: ServiceComponents, service_name: str):
self.components = components
self.service_name = service_name
def create_task_manager(self, **kwargs) -> BaseTaskManager:
"""Create task manager instance."""
return self.components.task_manager_class(**kwargs)
def create_state_manager(self, config) -> BaseStateManager:
"""Create state manager with config mapping."""
mapping = self.components.config_mapping.get("state_manager", {})
# Handle both dict and config schema objects
config_dict = config.get_all() if hasattr(config, "get_all") else config
kwargs = apply_config_mapping(config_dict, mapping)
return self.components.state_manager_class(**kwargs)
def create_login_helper(self, config) -> BaseLoginHelper:
"""Create login helper with config mapping."""
mapping = self.components.config_mapping.get("login_helper", {})
# Handle both dict and config schema objects
config_dict = config.get_all() if hasattr(config, "get_all") else config
kwargs = apply_config_mapping(config_dict, mapping)
# Special handling for GitHub login helper - it needs a single token
if self.service_name == "github" and "token" in kwargs:
tokens_list = kwargs["token"]
if isinstance(tokens_list, list) and tokens_list:
kwargs["token"] = tokens_list[0] # Use first token for login helper
return self.components.login_helper_class(**kwargs)
class MCPServiceFactory:
"""Main factory interface."""
@classmethod
def create_service_config(cls, service_name: str):
"""Create MCP service configuration (backward compatible)."""
config = ConfigRegistry.get_config(service_name)
# Create a backward-compatible ServiceConfig-like object
class ServiceConfigCompat:
def __init__(self, service_name: str, config_dict: dict):
self.service_name = service_name
self.config = config_dict
self.api_key = config_dict.get("api_key")
return ServiceConfigCompat(service_name, config.get_all())
@classmethod
def create_task_manager(cls, service_name: str, **kwargs) -> BaseTaskManager:
"""Create task manager for the specified MCP service."""
components = ServiceRegistry.get_components(service_name)
return components.task_manager_class(**kwargs)
@classmethod
def create_state_manager(cls, service_name: str, **kwargs) -> BaseStateManager:
"""Create state manager for the specified MCP service."""
components = ServiceRegistry.get_components(service_name)
config = ConfigRegistry.get_config(service_name).get_all()
# Use provided kwargs or apply config mapping
if not kwargs:
mapping = components.config_mapping.get("state_manager", {})
kwargs = apply_config_mapping(config, mapping)
return components.state_manager_class(**kwargs)
@classmethod
def create_login_helper(cls, service_name: str, **kwargs) -> BaseLoginHelper:
"""Create login helper for the specified MCP service."""
components = ServiceRegistry.get_components(service_name)
config = ConfigRegistry.get_config(service_name).get_all()
# Use provided kwargs or apply config mapping
if not kwargs:
mapping = components.config_mapping.get("login_helper", {})
kwargs = apply_config_mapping(config, mapping)
# Special handling for GitHub login helper - it needs a single token
if service_name == "github" and "token" in kwargs:
tokens_list = kwargs["token"]
if isinstance(tokens_list, list) and tokens_list:
kwargs["token"] = tokens_list[0] # Use first token for login helper
return components.login_helper_class(**kwargs)
@classmethod
def get_supported_mcp_services(cls) -> list:
"""Get list of supported MCP services."""
return get_supported_mcp_services()
@classmethod
def get_config_info(cls, service_name: str) -> dict:
"""Get detailed configuration information for debugging."""
config = ConfigRegistry.get_config(service_name)
return config.get_debug_info()
@classmethod
def export_config_template(cls, service_name: str, output_path: str) -> None:
"""Export a configuration template for an MCP service."""
from pathlib import Path
ConfigRegistry.export_template(service_name, Path(output_path))
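# A minimal usage sketch of the factory, assuming the filesystem service
# definition and its environment-based configuration are already in place.
if __name__ == "__main__":  # illustrative only
    print("Supported services:", MCPServiceFactory.get_supported_mcp_services())
    task_manager = MCPServiceFactory.create_task_manager("filesystem")
    state_manager = MCPServiceFactory.create_state_manager("filesystem")
    print("Created:", type(task_manager).__name__, type(state_manager).__name__)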
================================================
FILE: src/logger.py
================================================
#!/usr/bin/env python3
"""Logger configuration for MCPMark."""
import logging
import sys
def get_logger(name: str) -> logging.Logger:
"""Get a configured logger instance."""
logger = logging.getLogger(name)
if not logger.handlers:
handler = logging.StreamHandler(sys.stdout)
formatter = logging.Formatter("%(message)s")
handler.setFormatter(formatter)
logger.addHandler(handler)
logger.setLevel(logging.INFO)
return logger
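# Usage sketch: each module obtains a message-only stdout logger this way.
if __name__ == "__main__":  # illustrative only
    get_logger(__name__).info("| ✓ logger configured")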
================================================
FILE: src/mcp_services/filesystem/__init__.py
================================================
"""
Filesystem MCP Service for MCPMark
===================================
This module provides filesystem-specific MCP server integration for MCPMark evaluation.
Uses the official filesystem MCP server for local file operations.
"""
from .filesystem_login_helper import FilesystemLoginHelper
from .filesystem_state_manager import FilesystemStateManager
from .filesystem_task_manager import FilesystemTaskManager, FilesystemTask
__all__ = [
"FilesystemLoginHelper",
"FilesystemStateManager",
"FilesystemTaskManager",
"FilesystemTask",
]
================================================
FILE: src/mcp_services/filesystem/filesystem_login_helper.py
================================================
"""
Filesystem Login Helper for MCPMark
====================================
This module provides a minimal login helper for the filesystem MCP service.
Since filesystem operations don't require authentication, this is a simple
pass-through implementation that satisfies the interface requirements.
"""
from pathlib import Path
from typing import Optional
from src.base.login_helper import BaseLoginHelper
from src.logger import get_logger
logger = get_logger(__name__)
class FilesystemLoginHelper(BaseLoginHelper):
"""
Login helper for filesystem MCP service.
The filesystem MCP server doesn't require authentication, so this
implementation simply returns success for all login operations.
"""
def __init__(self, state_path: Optional[Path] = None):
"""
Initialize the filesystem login helper.
Args:
state_path: Path to save state (not used for filesystem)
"""
super().__init__()
self.state_path = (
state_path or Path.home() / ".mcpmark" / "filesystem_state.json"
)
logger.info("Initialized FilesystemLoginHelper (no auth required)")
def login(self, **kwargs) -> bool:
"""
Perform login operation.
Since filesystem doesn't require authentication, this always returns True.
Returns:
bool: Always True for filesystem service
"""
logger.info("Filesystem service does not require authentication")
return True
def is_authenticated(self) -> bool:
"""
Check if authenticated.
Returns:
bool: Always True for filesystem service
"""
return True
def get_credentials(self) -> dict:
"""
Get credentials for the service.
Returns:
dict: Empty dict as no credentials needed
"""
return {}
================================================
FILE: src/mcp_services/filesystem/filesystem_state_manager.py
================================================
"""
Filesystem State Manager for MCPMark
=====================================
This module handles filesystem state management for consistent task evaluation.
It manages test directories, file creation/cleanup, and environment isolation.
"""
import os
import shutil
import tempfile
from pathlib import Path
from typing import Any, Dict, List, Optional
from src.base.state_manager import BaseStateManager
from src.base.task_manager import BaseTask
from src.logger import get_logger
logger = get_logger(__name__)
class FilesystemStateManager(BaseStateManager):
"""
Manages filesystem state for task evaluation.
This includes creating isolated test directories, tracking created resources,
and cleaning up after task completion.
"""
def _get_project_root(self) -> Path:
"""Find project root by looking for marker files."""
current = Path(__file__).resolve()
# Look for project root markers
for parent in current.parents:
if (parent / "pyproject.toml").exists() or (parent / "pipeline.py").exists():
return parent
# Fallback to old method if markers not found
return Path(__file__).parent / "../../../"
def __init__(self, test_root: Optional[Path] = None, cleanup_on_exit: bool = False):
"""
Initialize filesystem state manager.
Args:
test_root: Root directory for test operations (from FILESYSTEM_TEST_ROOT env var)
cleanup_on_exit: Whether to clean up test directories after tasks (default False for persistent environment)
"""
super().__init__(service_name="filesystem")
# Use provided test root or default to persistent test environment
if test_root:
self.test_root = Path(test_root)
else:
# Default to persistent test environment
project_root = self._get_project_root()
self.test_root = (project_root / "test_environments/desktop").resolve()
self.cleanup_on_exit = cleanup_on_exit
self.current_task_dir: Optional[Path] = None
self.created_resources: List[Path] = []
# Backup and restore functionality
self.backup_dir: Optional[Path] = None
self.backup_enabled = (
True # Enable backup/restore by default for task isolation
)
logger.info(
f"Initialized FilesystemStateManager with persistent test environment: {self.test_root}"
)
def initialize(self, **kwargs) -> bool:
"""
Initialize the filesystem environment.
Ensures the persistent test environment exists and is accessible.
Returns:
bool: True if initialization successful
"""
try:
# Ensure test environment directory exists
if not self.test_root.exists():
logger.error(f"Persistent test environment not found: {self.test_root}")
logger.error(
"Please ensure test_environments/desktop/ exists in the repository"
)
return False
logger.info(f"Using persistent test environment: {self.test_root}")
# Verify we can write to the directory
test_file = self.test_root / ".mcpbench_test"
test_file.write_text("test")
test_file.unlink()
return True
except Exception as e:
logger.error(f"Failed to initialize filesystem environment: {e}")
return False
def set_up(self, task: BaseTask) -> bool:
"""
Set up filesystem environment for a specific task.
Creates a backup of the current environment, then uses the backup
as the working directory to keep the original unchanged.
Args:
task: The task for which to set up the state
Returns:
bool: True if setup successful
"""
try:
# Dynamically set test root based on task category
self._set_dynamic_test_root(task)
# Create backup of current test environment before task execution
if self.backup_enabled:
if not self._create_backup(task):
logger.error(f"Failed to create backup for task {task.name}")
return False
# Use the backup directory as the working directory instead of the original
self.current_task_dir = (
self.backup_dir
) # Use backup directory for operations
logger.info(
f"| ✓ Using the backup environment for operations"
)
# Store the test directory path in the task object for use by task manager
if hasattr(task, "__dict__"):
task.test_directory = str(self.current_task_dir)
# Set environment variable for verification scripts and MCP server
os.environ["FILESYSTEM_TEST_DIR"] = str(self.current_task_dir)
return True
except Exception as e:
logger.error(f"Failed to set up filesystem state for {task.name}: {e}")
return False
def _set_dynamic_test_root(self, task: BaseTask) -> None:
"""
Dynamically set the test root directory based on the task category.
Args:
task: The task for which to set the test root
"""
# Get the base test environments directory from environment variable
base_test_root = os.getenv("FILESYSTEM_TEST_ROOT")
if not base_test_root:
# Fallback to default path
project_root = self._get_project_root()
base_test_root = str(project_root / "test_environments")
base_test_path = Path(base_test_root)
# If task has a category_id, append it to the base path
if task.category_id:
self.test_root = base_test_path / task.category_id
# Store the current task category for URL selection
self._current_task_category = task.category_id
logger.info(f"| ✓ Setting test root to category-specific directory: {self.test_root}")
else:
# Use the base test environments directory
self.test_root = base_test_path
# For base directory, use 'desktop' as default category
self._current_task_category = 'desktop'
logger.info(f"| Setting test root to base directory: {self.test_root}")
# Ensure the directory exists by downloading and extracting if needed
if not self.test_root.exists():
logger.warning(f"| Test directory does not exist: {self.test_root}")
if not self._download_and_extract_test_environment():
logger.error(f"Failed to download and extract test environment for: {self.test_root}")
raise RuntimeError(f"Test environment not available: {self.test_root}")
logger.info(f"| Downloaded and extracted test environment: {self.test_root}")
def clean_up(self, task: Optional[BaseTask] = None, **kwargs) -> bool:
"""
Clean up filesystem resources created during task execution.
Since we operate on the backup directory, we just need to clean up the backup.
Args:
task: The task to clean up after (optional)
**kwargs: Additional cleanup options
Returns:
bool: True if cleanup successful
"""
try:
cleanup_success = True
# Clean up the backup directory since we operated on it
if self.backup_enabled and self.backup_dir and self.backup_dir.exists():
try:
shutil.rmtree(self.backup_dir)
logger.info(
f"| ✓ Cleaned up backup directory for task {task.name if task else 'unknown'}"
)
self.backup_dir = None
except Exception as e:
logger.error(f"Failed to clean up backup directory: {e}")
cleanup_success = False
else:
logger.info("No backup directory to clean up")
# Clear the resources list
self.created_resources.clear()
return cleanup_success
except Exception as e:
logger.error(f"Filesystem cleanup failed: {e}")
return False
def get_test_directory(self) -> Optional[Path]:
"""
Get the current test directory path.
Returns:
Path to the current test directory, or None if not set up
"""
return self.current_task_dir
def get_service_config_for_agent(self) -> dict:
"""
Get service-specific configuration for agent execution.
Returns:
Dictionary containing configuration needed by the agent/MCP server
"""
service_config = {}
# Add test directory if available
if self.current_task_dir:
service_config["test_directory"] = str(self.current_task_dir)
return service_config
def track_resource(self, resource_path: Path):
"""
Track a resource for cleanup.
Args:
resource_path: Path to the resource to track
"""
if resource_path not in self.created_resources:
self.created_resources.append(resource_path)
logger.debug(f"Tracking resource for cleanup: {resource_path}")
def reset_test_environment(self) -> bool:
"""
Reset the test environment to its original state.
This method can be used for development/debugging purposes.
In normal operation, the persistent environment is maintained.
Returns:
bool: True if reset successful
"""
try:
# Remove any sorting directories that might have been created
sorting_dirs = ["has_test", "no_test", "organized", "backup"]
for dir_name in sorting_dirs:
dir_path = self.test_root / dir_name
if dir_path.exists():
shutil.rmtree(dir_path)
logger.info(f"Removed sorting directory: {dir_path}")
# Remove any temporary files that might have been created
temp_files = ["hello_world.txt", "new_file.txt", "temp.txt"]
for file_name in temp_files:
file_path = self.test_root / file_name
if file_path.exists():
file_path.unlink()
logger.info(f"Removed temporary file: {file_path}")
logger.info("Test environment reset completed")
return True
except Exception as e:
logger.error(f"Test environment reset failed: {e}")
return False
# =========================================================================
# Backup and Restore Methods for Task Isolation
# =========================================================================
def _create_backup(self, task: BaseTask) -> bool:
"""
Create a complete backup of the test environment before task execution.
Args:
task: The task for which to create backup
Returns:
bool: True if backup successful
"""
try:
# Create backup directory with task-specific name
project_root = self._get_project_root()
backup_root = (project_root / ".mcpmark_backups").resolve()
backup_root.mkdir(exist_ok=True)
task_id = f"{task.service}_{task.category_id}_{task.task_id}"
self.backup_dir = backup_root / f"backup_{task_id}_{os.getpid()}"
# Remove existing backup if it exists
if self.backup_dir.exists():
shutil.rmtree(self.backup_dir)
# Create fresh backup by copying entire test environment
shutil.copytree(self.test_root, self.backup_dir)
logger.info(f"| ✓ Created backup for task {task.name}: {self.backup_dir}")
return True
except Exception as e:
logger.error(f"Failed to create backup for task {task.name}: {e}")
return False
def _restore_from_backup(self, task: Optional[BaseTask] = None) -> bool:
"""
Restore the test environment from backup.
Args:
task: The task to restore after (optional, for logging)
Returns:
bool: True if restore successful
"""
try:
if not self.backup_dir or not self.backup_dir.exists():
logger.error("No backup directory available for restore")
return False
# Remove current test environment
if self.test_root.exists():
shutil.rmtree(self.test_root)
# Restore from backup
shutil.copytree(self.backup_dir, self.test_root)
# Clean up backup directory
shutil.rmtree(self.backup_dir)
self.backup_dir = None
task_name = task.name if task else "unknown"
logger.info(
f"✅ Restored test environment from backup after task {task_name}"
)
return True
except Exception as e:
task_name = task.name if task else "unknown"
logger.error(f"Failed to restore from backup after task {task_name}: {e}")
return False
# =========================================================================
# Abstract Method Implementations Required by BaseStateManager
# =========================================================================
def _create_initial_state(self, task: BaseTask) -> Optional[Dict[str, Any]]:
"""Create initial state for a task.
For filesystem, this is handled in set_up() method by creating task directories.
Returns the task directory path as state info.
"""
if self.current_task_dir and self.current_task_dir.exists():
return {"task_directory": str(self.current_task_dir)}
return None
def _store_initial_state_info(
self, task: BaseTask, state_info: Dict[str, Any]
) -> None:
"""Store initial state information in the task object.
For filesystem, we store the test directory path.
"""
if state_info and "task_directory" in state_info:
if hasattr(task, "__dict__"):
task.test_directory = state_info["task_directory"]
def _cleanup_task_initial_state(self, task: BaseTask) -> bool:
"""Clean up initial state for a specific task.
For filesystem, this means removing the task directory.
"""
if hasattr(task, "test_directory") and task.test_directory:
task_dir = Path(task.test_directory)
if task_dir.exists():
try:
shutil.rmtree(task_dir)
logger.info(f"Cleaned up task directory: {task_dir}")
return True
except Exception as e:
logger.error(f"Failed to clean up task directory: {e}")
return False
return True
def _cleanup_single_resource(self, resource: Dict[str, Any]) -> bool:
"""Clean up a single tracked resource.
For filesystem, resources are paths to files/directories.
"""
if "path" in resource:
resource_path = Path(resource["path"])
if resource_path.exists():
try:
if resource_path.is_dir():
shutil.rmtree(resource_path)
else:
resource_path.unlink()
logger.info(f"Cleaned up resource: {resource_path}")
return True
except Exception as e:
logger.error(f"Failed to clean up {resource_path}: {e}")
return False
return True
def _download_and_extract_test_environment(self) -> bool:
"""
Download and extract test environment using wget and unzip commands.
This approach preserves original file timestamps and is simpler than Python zipfile.
Returns:
bool: True if download and extraction successful
"""
try:
import subprocess
import sys
# Define URL mapping for different test environment categories
url_mapping = {
'desktop': 'https://storage.mcpmark.ai/filesystem/desktop.zip',
'file_context': 'https://storage.mcpmark.ai/filesystem/file_context.zip',
'file_property': 'https://storage.mcpmark.ai/filesystem/file_property.zip',
'folder_structure': 'https://storage.mcpmark.ai/filesystem/folder_structure.zip',
'papers': 'https://storage.mcpmark.ai/filesystem/papers.zip',
'student_database': 'https://storage.mcpmark.ai/filesystem/student_database.zip',
'threestudio': 'https://storage.mcpmark.ai/filesystem/threestudio.zip',
'votenet': 'https://storage.mcpmark.ai/filesystem/votenet.zip',
'legal_document': 'https://storage.mcpmark.ai/filesystem/legal_document.zip',
'desktop_template': 'https://storage.mcpmark.ai/filesystem/desktop_template.zip'
}
# Get the category from the current task context
category = getattr(self, '_current_task_category', None)
if not category:
logger.error("| No task category available for URL selection")
return False
# Select the appropriate URL based on category
if category in url_mapping:
test_env_url = url_mapping[category]
logger.info(f"| ○ Selected URL for category '{category}': {test_env_url}")
else:
logger.error(f"| No URL mapping found for category: {category}")
return False
# Allow override via environment variable
test_env_url = os.getenv('TEST_ENVIRONMENT_URL', test_env_url)
logger.info(f"| ○ Downloading test environment from: {test_env_url}")
# Create a temporary directory for the download
with tempfile.TemporaryDirectory() as temp_dir:
temp_path = Path(temp_dir)
zip_path = temp_path / "test_environment.zip"
# Step 1: Download using wget
logger.info("| ○ Downloading test environment zip file...")
try:
                    # Use wget if available, otherwise fall back to curl
                    # (the command sequence is identical on every platform)
                    try:
                        result = subprocess.run(
                            ["wget", "-O", str(zip_path), test_env_url],
                            capture_output=True, text=True, check=True
                        )
                    except (subprocess.CalledProcessError, FileNotFoundError):
                        # Fall back to curl
                        result = subprocess.run(
                            ["curl", "-L", "-o", str(zip_path), test_env_url],
                            capture_output=True, text=True, check=True
                        )
logger.info("| ✓ Download completed successfully")
except Exception as e:
logger.error(f"| Download failed: {e}")
return False
# Step 2: Extract using unzip
logger.info("| ○ Extracting test environment...")
try:
# Extract to parent directory to maintain expected structure
result = subprocess.run(
["unzip", "-o", str(zip_path), "-d", str(self.test_root.parent)],
capture_output=True, text=True, check=True
)
logger.info("| ✓ Extraction completed successfully")
except Exception as e:
logger.error(f"| Extraction failed: {e}")
return False
# Step 3: Remove __MACOSX folder if it exists
logger.info("| ○ Cleaning up macOS metadata...")
macosx_path = self.test_root.parent / "__MACOSX"
if macosx_path.exists():
try:
shutil.rmtree(macosx_path)
logger.info("| ✓ Removed __MACOSX folder")
except Exception as e:
logger.warning(f"| Failed to remove __MACOSX folder: {e}")
# Verify the extracted directory exists
if not self.test_root.exists():
logger.error(f"| Extracted directory not found at expected path: {self.test_root}")
return False
logger.info(f"| ✓ Successfully downloaded and extracted test environment to: {self.test_root}")
return True
except Exception as e:
logger.error(f"| Failed to download and extract test environment: {e}")
return False
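# A minimal lifecycle sketch, assuming the persistent test environment has been
# extracted under test_environments/; a real `task` would come from
# FilesystemTaskManager.filter_tasks() rather than being built by hand.
if __name__ == "__main__":  # illustrative only
    manager = FilesystemStateManager()
    if manager.initialize():
        print("Persistent test root:", manager.test_root)
        # For a real task: manager.set_up(task) copies the category directory into
        # a backup, points FILESYSTEM_TEST_DIR at that copy, and
        # manager.clean_up(task) removes the copy afterwards.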
================================================
FILE: src/mcp_services/filesystem/filesystem_task_manager.py
================================================
"""
Simplified Filesystem Task Manager using Enhanced Base Class
============================================================
This module implements a simplified filesystem task manager
built on the enhanced base task manager.
"""
import os
import subprocess
from dataclasses import dataclass
from pathlib import Path
from typing import List, Optional, Dict, Any
from src.base.task_manager import BaseTask, BaseTaskManager
from src.logger import get_logger
logger = get_logger(__name__)
@dataclass
class FilesystemTask(BaseTask):
"""Filesystem-specific task with additional fields."""
test_directory: Optional[str] = None
expected_files: Optional[List[str]] = None
expected_directories: Optional[List[str]] = None
class FilesystemTaskManager(BaseTaskManager):
"""Simplified filesystem task manager using enhanced base class."""
def __init__(self, tasks_root: Path = None, task_suite: str = "standard"):
"""Initialize filesystem task manager."""
if tasks_root is None:
tasks_root = Path(__file__).resolve().parents[3] / "tasks"
super().__init__(
tasks_root,
mcp_service="filesystem",
task_class=FilesystemTask,
task_organization="directory",
task_suite=task_suite,
)
# Override only what's needed for filesystem-specific behavior
def _create_task_from_files(
self, category_id: str, task_files_info: Dict[str, Any]
) -> BaseTask:
"""Instantiate a `BaseTask` from the dictionary returned by `_find_task_files`."""
import json
# Support arbitrary task names, not just task_n format
task_name = task_files_info["task_id"]
# Use task_name as default task_id
task_id = task_name
# Check for meta.json
meta_path = task_files_info["instruction_path"].parent / "meta.json"
final_category_id = category_id
if meta_path.exists():
try:
with open(meta_path, 'r') as f:
meta_data = json.load(f)
# Use values from meta.json if available
final_category_id = meta_data.get("category_id", category_id)
task_id = meta_data.get("task_id", task_id)
except Exception as e:
logger.warning(f"Failed to load meta.json from {meta_path}: {e}")
return self.task_class(
task_instruction_path=task_files_info["instruction_path"],
task_verification_path=task_files_info["verification_path"],
service="filesystem",
category_id=final_category_id,
task_id=task_id,
)
def run_verification(self, task: BaseTask) -> subprocess.CompletedProcess:
"""Run verification with filesystem-specific environment."""
env = os.environ.copy()
# Pass test directory to verification script
# Priority: task.test_directory (set by state manager) > environment variable
test_dir = None
if hasattr(task, "test_directory") and task.test_directory:
test_dir = task.test_directory
else:
test_dir = os.getenv("FILESYSTEM_TEST_DIR")
if test_dir:
env["FILESYSTEM_TEST_DIR"] = test_dir
logger.debug(f"Setting FILESYSTEM_TEST_DIR to: {test_dir}")
return subprocess.run(
self._get_verification_command(task),
capture_output=True,
text=True,
timeout=300,
env=env,
)
def filter_tasks(self, task_filter: str) -> List[BaseTask]:
"""Filter tasks based on category or specific task pattern with support for arbitrary task names."""
all_tasks = self.discover_all_tasks()
if not task_filter or task_filter.lower() == "all":
return all_tasks
# Check if it's a category filter
categories = self.get_categories()
if task_filter in categories:
return [task for task in all_tasks if task.category_id == task_filter]
# Check for specific task pattern (category_id/task_X or category_id/arbitrary_name)
if "/" in task_filter:
try:
category_id, task_id = task_filter.split("/", 1)
# Direct string matching for task_id
for task in all_tasks:
if task.category_id == category_id and str(task.task_id) == task_id:
return [task]
except (ValueError, IndexError):
pass
# Fallback: check for partial matches in task names or categories
filtered_tasks = []
for task in all_tasks:
if (
task_filter in task.category_id
or task_filter in task.name
or task_filter == str(task.task_id)
):
filtered_tasks.append(task)
return filtered_tasks
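# A minimal filter sketch; the category/task ids below come from the bundled
# tasks tree and are illustrative only.
if __name__ == "__main__":  # illustrative only
    manager = FilesystemTaskManager()
    print(len(manager.filter_tasks("all")))                        # every filesystem task
    print(len(manager.filter_tasks("file_property")))              # a single category
    print(len(manager.filter_tasks("file_property/txt_merging")))  # one specific task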
================================================
FILE: src/mcp_services/github/__init__.py
================================================
"""
GitHub MCP Service for MCPMark
===============================
This module provides GitHub-specific MCP server integration for MCPMark evaluation.
Uses GitHub's official remote MCP server for streamable HTTP/SSE communication.
Includes an initial-state-based environment replication mechanism.
"""
from .github_login_helper import GitHubLoginHelper
from .github_task_manager import GitHubTaskManager, GitHubTask
from .github_state_manager import GitHubStateManager
__all__ = ["GitHubLoginHelper", "GitHubTaskManager", "GitHubTask", "GitHubStateManager"]
================================================
FILE: src/mcp_services/github/github_login_helper.py
================================================
"""
GitHub Login Helper for MCPMark
================================
This module provides GitHub token authentication and validation utilities.
Unlike browser-based services, GitHub uses token-based authentication.
"""
import json
import requests
from pathlib import Path
from typing import Optional, Dict, Any
from src.base.login_helper import BaseLoginHelper
from src.logger import get_logger
logger = get_logger(__name__)
class GitHubLoginHelper(BaseLoginHelper):
"""
Utility helper for GitHub token authentication and validation.
"""
def __init__(
self,
token: Optional[str] = None,
state_path: Optional[Path] = None,
) -> None:
"""
Initialize the GitHub login helper.
Args:
token: GitHub Personal Access Token
state_path: Path to save authentication state
"""
self.token = token
self.state_path = state_path or Path.home() / ".mcpmark" / "github_auth.json"
# Ensure state directory exists
self.state_path.parent.mkdir(parents=True, exist_ok=True)
def login_and_save_state(self, **kwargs) -> bool:
"""
Validate GitHub token and save authentication state.
Returns:
bool: True if authentication successful, False otherwise
"""
if not self.token:
logger.error("No GitHub token provided")
return False
try:
# Validate token by making an authenticated request
session = requests.Session()
session.headers.update(
{
"Authorization": f"Bearer {self.token}",
"Accept": "application/vnd.github.v3+json",
"X-GitHub-Api-Version": "2022-11-28",
"User-Agent": "MCPMark/1.0",
}
)
# Get user information
response = session.get("https://api.github.com/user")
if response.status_code != 200:
logger.error(
f"GitHub authentication failed: {response.status_code} {response.text}"
)
return False
user_info = response.json()
logger.info(
f"GitHub authentication successful for user: {user_info['login']}"
)
# Get token scopes
token_scopes = self._get_token_scopes(session)
# Save authentication state
auth_state = {
"user": user_info,
"token_scopes": token_scopes,
"authenticated_at": self._get_current_timestamp(),
}
self._save_auth_state(auth_state)
# Verify required permissions
if not self._verify_required_permissions(token_scopes):
logger.warning("GitHub token may not have all required permissions")
return False
return True
except Exception as e:
logger.error(f"GitHub authentication error: {e}")
return False
def _get_token_scopes(self, session: requests.Session) -> list:
"""Get the scopes available to the current token."""
try:
response = session.get("https://api.github.com/user")
scopes_header = response.headers.get("X-OAuth-Scopes", "")
if scopes_header:
return [
scope.strip() for scope in scopes_header.split(",") if scope.strip()
]
return []
except Exception as e:
logger.warning(f"Could not determine token scopes: {e}")
return []
def _verify_required_permissions(self, scopes: list) -> bool:
"""
Verify that the token has the minimum required permissions.
For MCPMark GitHub tasks, we typically need:
- repo (for repository access)
- read:user (for user information)
"""
required_scopes = ["repo"] # Minimum requirement
recommended_scopes = ["repo", "read:user", "read:org"]
has_required = all(scope in scopes for scope in required_scopes)
if not has_required:
logger.error(
f"Token missing required scopes. Required: {required_scopes}, Available: {scopes}"
)
return False
has_recommended = all(scope in scopes for scope in recommended_scopes)
if not has_recommended:
logger.warning(
f"Token missing some recommended scopes. Recommended: {recommended_scopes}, Available: {scopes}"
)
return True
def _save_auth_state(self, auth_state: Dict[str, Any]):
"""Save authentication state to local file."""
try:
with open(self.state_path, "w") as f:
json.dump(auth_state, f, indent=2, default=str)
# Set restrictive permissions (user read/write only)
self.state_path.chmod(0o600)
logger.info(f"Authentication state saved to: {self.state_path}")
except Exception as e:
logger.error(f"Failed to save authentication state: {e}")
def _get_current_timestamp(self) -> str:
"""Get current timestamp in ISO format."""
from datetime import datetime
return datetime.utcnow().isoformat() + "Z"
def get_saved_auth_state(self) -> Optional[Dict[str, Any]]:
"""Load and return saved authentication state."""
try:
if self.state_path.exists():
with open(self.state_path, "r") as f:
return json.load(f)
except Exception as e:
logger.error(f"Failed to load authentication state: {e}")
return None
def is_token_valid(self) -> bool:
"""Check if the current token is still valid."""
if not self.token:
return False
try:
session = requests.Session()
session.headers.update(
{
"Authorization": f"Bearer {self.token}",
"Accept": "application/vnd.github.v3+json",
}
)
response = session.get("https://api.github.com/user")
return response.status_code == 200
except Exception:
return False
def get_rate_limit_info(self) -> Dict[str, Any]:
"""Get current rate limit information for the token."""
if not self.token:
return {}
try:
session = requests.Session()
session.headers.update(
{
"Authorization": f"Bearer {self.token}",
"Accept": "application/vnd.github.v3+json",
}
)
response = session.get("https://api.github.com/rate_limit")
if response.status_code == 200:
return response.json()
except Exception as e:
logger.warning(f"Failed to get rate limit info: {e}")
return {}
def test_repository_access(self, owner: str, repo: str) -> bool:
"""Test if the token has access to a specific repository."""
if not self.token:
return False
try:
session = requests.Session()
session.headers.update(
{
"Authorization": f"Bearer {self.token}",
"Accept": "application/vnd.github.v3+json",
}
)
response = session.get(f"https://api.github.com/repos/{owner}/{repo}")
return response.status_code == 200
except Exception:
return False
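# A minimal validation sketch; the GITHUB_TOKEN environment variable used here
# is only an example of where a personal access token might come from.
if __name__ == "__main__":  # illustrative only
    import os
    helper = GitHubLoginHelper(token=os.getenv("GITHUB_TOKEN"))
    if helper.login_and_save_state():
        print("Token valid:", helper.is_token_valid())
        print("Rate limit:", helper.get_rate_limit_info().get("rate", {}))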
================================================
FILE: src/mcp_services/github/github_state_manager.py
================================================
"""
GitHub State Manager for MCPMark
=================================
This module handles GitHub repository state management for consistent task evaluation.
Manages test repositories, branches, and cleanup after evaluation.
"""
import requests
from typing import Optional, List, Union
from pathlib import Path
from src.base.state_manager import BaseStateManager, InitialStateInfo
from src.base.task_manager import BaseTask
from src.logger import get_logger
from src.mcp_services.github.token_pool import GitHubTokenPool
logger = get_logger(__name__)
class GitHubStateManager(BaseStateManager):
"""
Manages GitHub repository state for task evaluation.
"""
def __init__(
self,
github_token: Union[str, List[str]],
# Name of the evaluation organisation / user where temporary test repositories are created
eval_org: str = "mcpmark-eval",
# Local directory that stores *exported* repository templates (produced by repo_exporter.py)
templates_root: str = "./github_state",
):
"""
Initialize GitHub state manager.
Args:
github_token: GitHub Personal Access Token(s). Can be a single token string or a list of tokens for round-robin usage.
eval_org: Organisation / user used to host **ephemeral evaluation repositories**.
"""
super().__init__(service_name="github")
# Track repos created via template import so we can delete them afterwards
self._repos_to_cleanup: list[tuple[str, str]] = [] # (owner, repo_name)
# Initialize token pool
if isinstance(github_token, str):
# Single token - create pool with one token
self.token_pool = GitHubTokenPool([github_token])
self.github_token = github_token # Keep for backward compatibility
else:
# Multiple tokens - use token pool
self.token_pool = GitHubTokenPool(github_token)
self.github_token = (
self.token_pool.get_current_token()
) # For backward compatibility
# Store evaluation context (consistent naming)
self.eval_org = eval_org # evaluation organisation / user
# Local path that contains exported repository templates
self.templates_root = Path(templates_root).expanduser().resolve()
# Set up HTTP session for GitHub API
self.session = requests.Session()
# Note: We'll update the Authorization header before each request
self.session.headers.update(
{
"Accept": "application/vnd.github.v3+json",
"X-GitHub-Api-Version": "2022-11-28",
"User-Agent": "MCPMark/1.0",
}
)
# Validate GitHub configuration during initialization
try:
# Set initial token for validation
self._update_session_token()
response = self.session.get("https://api.github.com/user")
if response.status_code != 200:
raise ValueError(
f"Invalid GitHub token: {response.status_code} {response.text}"
)
user_info = response.json()
logger.info(f"GitHub authenticated as: {user_info['login']}")
logger.info(f"Using token pool with {self.token_pool.pool_size} token(s)")
# Check if evaluation organisation exists (optional)
if self.eval_org:
org_response = self.session.get(
f"https://api.github.com/orgs/{self.eval_org}"
)
if org_response.status_code == 200:
logger.info(f"Using evaluation organisation: {self.eval_org}")
else:
logger.warning(
f"Evaluation organisation {self.eval_org} not accessible, using user account"
)
# Fall back to user account
self.eval_org = user_info["login"]
logger.info("GitHub state manager initialized successfully")
except Exception as e:
raise RuntimeError(f"GitHub initialization failed: {e}")
# Initial state mapping - categories to initial state repositories
self.initial_state_mapping = {
"build_your_own_x": "codecrafters-io-build-your-own-x",
"missing-semester": "missing-semester-missing-semester",
"mcpmark-cicd": "zjwu0522-mcpmark-cicd",
"harmony": "openai-harmony",
"claude-code": "anthropics-claude-code",
"easyr1": "hiyouga-EasyR1",
}
# CDN URL mapping for downloading GitHub templates
self.github_template_url_mapping = {
"codecrafters-io-build-your-own-x": "https://storage.mcpmark.ai/github/codecrafters-io-build-your-own-x.zip",
"missing-semester-missing-semester": "https://storage.mcpmark.ai/github/missing-semester-missing-semester.zip",
"zjwu0522-mcpmark-cicd": "https://storage.mcpmark.ai/github/zjwu0522-mcpmark-cicd.zip",
"openai-harmony": "https://storage.mcpmark.ai/github/openai-harmony.zip",
"anthropics-claude-code": "https://storage.mcpmark.ai/github/anthropics-claude-code.zip",
"hiyouga-EasyR1": "https://storage.mcpmark.ai/github/hiyouga-EasyR1.zip",
}
# =========================================================================
# Core Template Methods (Required by BaseStateManager)
# =========================================================================
# ---------------------------------------------------------------------
# Internal helper – template importer (replicates repo_importer logic)
# ---------------------------------------------------------------------
def _import_template_repo(
self, template_dir: Path, owner: str, private: bool = True
) -> str:
"""Import repository from local template directory to GitHub (simplified)."""
import json
import subprocess
import time
# ------------------------------------------------------------------
# Helper functions (stripped-down versions of repo_importer utilities)
# ------------------------------------------------------------------
def _list_refs(repo_dir: str) -> list[str]:
result = subprocess.run(
["git", "-C", repo_dir, "for-each-ref", "--format=%(refname)"],
check=True,
capture_output=True,
text=True,
)
return result.stdout.strip().splitlines()
def _push_repo(
repo_path: Path, repo_owner: str, repo_name: str, required_refs: list[str]
):
"""Push repo to GitHub: try mirror, else per-ref."""
token = self.github_token
dst_url = f"https://x-access-token:{token}@github.com/{repo_owner}/{repo_name}.git"
try:
subprocess.run(
["git", "-C", str(repo_path), "push", "--mirror", dst_url],
check=True,
capture_output=True,
)
return
except subprocess.CalledProcessError as err:
logger.warning(
"| [push] Mirror push failed – falling back: %s",
err.stderr.decode(errors="ignore"),
)
refs = required_refs or _list_refs(str(repo_path))
for ref in refs:
for attempt in range(3):
try:
subprocess.run(
[
"git",
"-C",
str(repo_path),
"push",
dst_url,
f"{ref}:{ref}",
],
check=True,
capture_output=True,
)
break
except subprocess.CalledProcessError as ref_err:
if attempt == 2:
raise RuntimeError(
f"Failed to push ref {ref}: {ref_err.stderr}"
) from ref_err
time.sleep(2 * (attempt + 1))
# ------------------------------------------------------------------
# Phase 0 – read template metadata
# ------------------------------------------------------------------
meta = json.loads((template_dir / "meta.json").read_text())
repo_name: str = meta["repo"]
pr_head_refs = meta.get("pr_head_refs", [])
default_branch = meta.get("default_branch", "main")
pulls_data = json.loads((template_dir / "pulls.json").read_text())
fork_branches = [
pr["local_branch"]
for pr in pulls_data
if pr.get("is_from_fork") and "local_branch" in pr
]
needed_refs = (
[f"refs/heads/{default_branch}"]
+ [f"refs/heads/{h}" for h in pr_head_refs]
+ [f"refs/heads/{b}" for b in fork_branches]
)
# ------------------------------------------------------------------
# Phase 1 – create empty repo under owner
# ------------------------------------------------------------------
create_payload = {
"name": repo_name,
"description": f"Restored template repo {repo_name}",
"private": private,
"auto_init": False,
"has_issues": True,
"has_projects": True,
"has_wiki": False,
"default_branch": default_branch, # Set the correct default branch
}
auth_user = self._get_authenticated_user()
create_url = (
"https://api.github.com/user/repos"
if owner == auth_user
else f"https://api.github.com/orgs/{owner}/repos"
)
resp = self._request_with_retry("POST", create_url, json=create_payload)
if resp.status_code == 422 and "name already exists" in resp.text:
# Attempt to delete and recreate
self._delete_repository(owner, repo_name)
resp = self._request_with_retry("POST", create_url, json=create_payload)
if resp.status_code not in (200, 201):
raise RuntimeError(f"Failed to create repo: {resp.status_code} {resp.text}")
html_url = resp.json()["html_url"]
logger.info("| [import] Target repository created: %s", html_url)
# Safety check: Prevent importing to public repositories
# Public repos would send @ mention notifications to real users, causing spam
if not private:
error_msg = (
"ERROR: Cannot import template to a public repository.\n\n"
"Reason: The template contains @ mentions of real GitHub users from the original\n"
"repository. Importing to a public repository would send notifications to these\n"
"users, which is disruptive and inappropriate.\n\n"
"Solution: Set private=True when calling _import_template_repo()."
)
logger.error(error_msg)
# Clean up the created repo before raising
self._delete_repository(owner, repo_name)
raise RuntimeError(error_msg)
# Immediately disable GitHub Actions for ALL repositories to prevent any accidental triggers
# We'll re-enable it later only for mcpmark-cicd
logger.info(
"| [import] Disabling GitHub Actions immediately after repo creation..."
)
self._disable_github_actions(owner, repo_name)
# ------------------------------------------------------------------
# Phase 2 – push git history
# ------------------------------------------------------------------
repo_path = template_dir / "repo"
logger.info("| [import] Pushing git history …")
_push_repo(repo_path, owner, repo_name, needed_refs)
# Remove .github directory after pushing with a new commit
import shutil
github_dir = repo_path / ".github"
if github_dir.exists():
logger.info("| [import] Removing .github directory after push …")
shutil.rmtree(github_dir)
# Commit the deletion
subprocess.run(
["git", "-C", str(repo_path), "add", "-A"],
check=True,
capture_output=True,
)
subprocess.run(
[
"git",
"-C",
str(repo_path),
"commit",
"-m",
"Remove .github directory",
],
capture_output=True,
)
# Push the new commit
token = self.github_token
dst_url = (
f"https://x-access-token:{token}@github.com/{owner}/{repo_name}.git"
)
subprocess.run(
["git", "-C", str(repo_path), "push", dst_url],
check=True,
capture_output=True,
)
# ------------------------------------------------------------------
# Phase 3 – recreate issues & PRs
# ------------------------------------------------------------------
def _create_comment(issue_number: int, body: str):
self._request_with_retry(
"POST",
f"https://api.github.com/repos/{owner}/{repo_name}/issues/{issue_number}/comments",
json={"body": body},
)
def _create_issue(item: dict) -> Optional[int]:
data = {
"title": item["title"],
"body": self._obfuscate_mentions(item.get("body", "")),
"labels": item.get("labels", []),
}
r = self._request_with_retry(
"POST",
f"https://api.github.com/repos/{owner}/{repo_name}/issues",
json=data,
)
if r.status_code not in (200, 201):
return None
new_no = r.json()["number"]
if item.get("state") == "closed":
self._request_with_retry(
"PATCH",
f"https://api.github.com/repos/{owner}/{repo_name}/issues/{new_no}",
json={"state": "closed"},
)
return new_no
def _create_pull(pr_itm: dict) -> Optional[int]:
body = self._obfuscate_mentions(pr_itm.get("body", ""))
if pr_itm.get("is_from_fork", False):
fork_note = f"\n\n---\n_This PR was originally from a fork: **{pr_itm.get('fork_owner')}/{pr_itm.get('fork_repo')}** (branch: `{pr_itm['head']}`)_"
body = body + fork_note if body else fork_note[2:]
payload = {
"title": pr_itm["title"],
"body": body,
"head": pr_itm.get("local_branch", pr_itm["head"]),
"base": pr_itm["base"],
}
r = self._request_with_retry(
"POST",
f"https://api.github.com/repos/{owner}/{repo_name}/pulls",
json=payload,
)
if r.status_code not in (200, 201):
return None
return r.json()["number"]
# Issues
issues_data = json.loads((template_dir / "issues.json").read_text())
created_issues = 0
logger.info("| [phase] Re-creating issues …")
for itm in issues_data:
new_no = _create_issue(itm)
if new_no:
created_issues += 1
for c in itm.get("comments", []):
_create_comment(
new_no,
self._obfuscate_mentions(
f"*Original author: @{c['user']}*\n\n{c['body']}"
),
)
logger.info(
"| [phase] Created %d out of %d issues", created_issues, len(issues_data)
)
# Pull requests
logger.info("| [phase] Re-creating pull requests …")
created_prs = 0
skipped_prs = 0
for pr in pulls_data:
new_pr_no = _create_pull(pr)
if new_pr_no:
created_prs += 1
for c in pr.get("comments", []):
_create_comment(
new_pr_no,
self._obfuscate_mentions(
f"*Original author: @{c['user']}*\n\n{c['body']}"
),
)
for rc in pr.get("review_comments", []):
_create_comment(
new_pr_no,
self._obfuscate_mentions(
f"*Original author: @{rc['user']}* (review)\n\n{rc['body']}"
),
)
else:
skipped_prs += 1
logger.info(
"| [phase] Created %d PRs, skipped %d PRs", created_prs, skipped_prs
)
# Re-enable GitHub Actions ONLY for mcpmark-cicd repository
# All other repos remain disabled (as set immediately after creation)
if "mcpmark-cicd" in template_dir.name:
logger.info("| [import] Re-enabling GitHub Actions for CI/CD repository…")
self._enable_github_actions(owner, repo_name)
# Disable notifications to prevent email spam
logger.info("| [import] Disabling repository notifications …")
self._disable_repository_notifications(owner, repo_name)
logger.info("| [import] Repository import complete: %s", html_url)
return html_url
# ---------------------------------------------------------------------
# Public – create initial state using local template import
# ---------------------------------------------------------------------
def _create_initial_state(self, task: "BaseTask") -> Optional[InitialStateInfo]:
"""
Set up GitHub environment for a specific task.
This may involve:
1. Creating/forking test repositories
2. Setting up branches
3. Creating issues or PRs if needed
"""
try:
logger.info(f"| Setting up GitHub state for task: {task.name}")
template_name = self.select_initial_state_for_task(task.category_id)
if template_name is None:
raise RuntimeError(
f"No template configured for task category: {task.category_id}"
)
template_dir = (self.templates_root / template_name).resolve()
if not template_dir.exists():
logger.warning(
"| Template directory %s not found locally, attempting to download from CDN",
template_dir,
)
if not self._download_and_extract_github_template(template_name):
logger.error(
"| Failed to download template %s from CDN", template_name
)
return None
logger.info("| Template %s downloaded successfully", template_name)
logger.info(f"| Importing repository template from {template_dir} …")
owner = self.eval_org if self.eval_org else self._get_authenticated_user()
if "mcpmark-cicd" in template_name:
repo_url = self._import_template_repo(template_dir, owner, False)
else:
repo_url = self._import_template_repo(template_dir, owner, True)
# Record for cleanup later
repo_name = repo_url.rstrip("/").split("/")[-1]
self._repos_to_cleanup.append((owner, repo_name))
# Build InitialStateInfo
return InitialStateInfo(
state_id=f"{owner}/{repo_name}",
state_url=repo_url,
metadata={
"owner": owner,
"repo_name": repo_name,
"category": task.category_id,
"task_id": task.task_id,
},
)
except Exception as e:
logger.error(f"| GitHub setup failed for {task.name}: {e}")
return None
# ---------------------------------------------------------------------
# BaseStateManager required hooks
# ---------------------------------------------------------------------
def _store_initial_state_info(self, task, state_info: InitialStateInfo) -> None: # type: ignore[override]
if hasattr(task, "repository_url"):
task.repository_url = state_info.state_url
def _cleanup_task_initial_state(self, task) -> bool: # type: ignore[override]
"""No-op – cleanup is handled by self.clean_up which deletes imported repos."""
return True
def _cleanup_single_resource(self, resource) -> bool: # type: ignore[override]
"""No-op – we don't use BaseStateManager's tracked_resources anymore."""
return True
# ---------------------------------------------------------------------
def clean_up(self, task=None, **kwargs) -> bool:
"""Delete repositories that were imported for tasks."""
success = True
for owner, repo_name in self._repos_to_cleanup:
try:
self._delete_repository(owner, repo_name)
logger.info("| Deleted repository: %s/%s", owner, repo_name)
except Exception as err:
logger.error(
"| Failed to delete repository %s/%s: %s", owner, repo_name, err
)
success = False
self._repos_to_cleanup.clear()
return success
# =========================================================================
# Repository Creation and Setup Operations
# =========================================================================
def _delete_repository(self, owner: str, repo_name: str):
"""Delete a repository (use with caution)."""
delete_url = f"https://api.github.com/repos/{owner}/{repo_name}"
response = self.session.delete(delete_url)
if response.status_code not in [200, 204]:
logger.warning(
f"| Failed to delete repository {owner}/{repo_name}: {response.text}"
)
raise Exception(
f"| Failed to delete repository {owner}/{repo_name}: {response.status_code} {response.text}"
)
else:
logger.info(f"| Successfully deleted repository {owner}/{repo_name}")
def _obfuscate_mentions(self, text: str) -> str:
"""
Obfuscate @ mentions to prevent notifications to real users.
Replaces @username with @username_XXXX (random suffix) to ensure the mentioned
user does not exist on GitHub. This prevents notification spam when importing
templates that contain @ mentions from original repositories.
Args:
text: The text content that may contain @ mentions
Returns:
Text with obfuscated @ mentions
"""
import re
import random
import string
if not text:
return text
        # Pattern matches @username (GitHub usernames: alphanumeric, hyphens, max 39 chars)
        # Negative lookbehind keeps e-mail addresses and already-suffixed handles untouched.
        pattern = r"(?<![\w@])@([A-Za-z\d](?:[A-Za-z\d-]{0,37}[A-Za-z\d])?)"

        def _add_suffix(match):
            # Append a random 4-character suffix so the handle no longer resolves to a real user.
            suffix = "".join(random.choices(string.ascii_lowercase + string.digits, k=4))
            return f"@{match.group(1)}_{suffix}"

        return re.sub(pattern, _add_suffix, text)

    def _get_authenticated_user(self) -> str:
"""Return cached authenticated username or fetch once from GitHub."""
if hasattr(self, "_auth_user") and self._auth_user:
return self._auth_user
response = self.session.get("https://api.github.com/user")
if response.status_code == 200:
self._auth_user = response.json()["login"]
return self._auth_user
return None
# ---------------------------------------------------------------------
# Token management helpers
# ---------------------------------------------------------------------
def _update_session_token(self):
"""Update the session Authorization header with the current token."""
current_token = self.token_pool.get_current_token()
self.session.headers.update({"Authorization": f"Bearer {current_token}"})
# Update backward compatibility attribute
self.github_token = current_token
def _rotate_token(self):
"""Rotate to the next token in the pool and update session."""
next_token = self.token_pool.get_next_token()
self.session.headers.update({"Authorization": f"Bearer {next_token}"})
# Update backward compatibility attribute
self.github_token = next_token
logger.debug(f"| Rotated to next token in pool")
# ---------------------------------------------------------------------
# Generic request helper with rate-limit (403) retry handling
# ---------------------------------------------------------------------
def _request_with_retry(
self,
method: str,
url: str,
*,
max_retries: int = 2,
sleep_seconds: int = 120,
**kwargs,
):
"""Send a GitHub API request with basic rate-limit handling and token rotation.
If a request receives HTTP 403 (rate limit):
1. First try rotating to the next token in the pool
2. If still rate limited, sleep and retry
3. After max_retries are exhausted, raise RuntimeError
"""
import time # local import to avoid adding global dependency
attempt = 0
tokens_tried = 0
while True:
# Ensure we have the current token set
self._update_session_token()
resp = self.session.request(method, url, **kwargs)
# Successful or non-rate-limited response – return immediately
if resp.status_code != 403:
return resp
# 403 – very likely rate-limited
# First, try rotating tokens if we have multiple
if (
self.token_pool.pool_size > 1
and tokens_tried < self.token_pool.pool_size
):
logger.warning(
"| GitHub API rate limit encountered. Rotating to next token (tried %d/%d tokens)",
tokens_tried + 1,
self.token_pool.pool_size,
)
self._rotate_token()
tokens_tried += 1
continue
# All tokens exhausted or single token, resort to sleep/retry
if attempt >= max_retries:
raise RuntimeError(
f"GitHub API rate limited after {attempt + 1} attempts with {self.token_pool.pool_size} token(s): {resp.status_code} {resp.text}"
)
logger.warning(
"| All tokens rate limited (attempt %d/%d). Sleeping %d seconds before retrying …",
attempt + 1,
max_retries + 1,
sleep_seconds,
)
time.sleep(sleep_seconds)
attempt += 1
tokens_tried = 0 # Reset token counter for next attempt
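    # Illustrative call site (hypothetical URL) – a minimal sketch of how the helper
    # above is meant to be used; extra kwargs are forwarded to requests.Session.request:
    #
    #     resp = self._request_with_retry(
    #         "GET", "https://api.github.com/repos/octocat/Hello-World", timeout=30
    #     )
    #     if resp.status_code == 200:
    #         data = resp.json()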
# =========================================================================
# Initial State Selection and Repository Creation
# =========================================================================
# Initial state for each task category is resolved via self.initial_state_mapping
def select_initial_state_for_task(self, task_category: str) -> Optional[str]:
"""Resolve template name for a task category with light normalization."""
if not task_category:
return None
candidate_keys = []
candidate_keys.append(task_category)
# Allow users to swap between hyphen/underscore naming conventions.
hyphen_to_underscore = task_category.replace("-", "_")
if hyphen_to_underscore not in candidate_keys:
candidate_keys.append(hyphen_to_underscore)
underscore_to_hyphen = task_category.replace("_", "-")
if underscore_to_hyphen not in candidate_keys:
candidate_keys.append(underscore_to_hyphen)
for key in candidate_keys:
template = self.initial_state_mapping.get(key)
if template:
if key != task_category:
logger.debug(
"| Resolved GitHub template for %s via alias %s -> %s",
task_category,
key,
template,
)
return template
return None
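    # Illustrative resolution (hypothetical mapping) – hyphen/underscore aliases fall
    # back to whichever key is actually present in self.initial_state_mapping:
    #
    #     self.initial_state_mapping = {"code_review": "octocat-Hello-World"}
    #     self.select_initial_state_for_task("code-review")  # -> "octocat-Hello-World"
    #     self.select_initial_state_for_task("unknown")      # -> None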
def extract_repo_info_from_url(self, repo_url: str) -> tuple[str, str]:
"""Extract owner and repo name from GitHub URL."""
try:
from urllib.parse import urlparse
# Support https://github.com/owner/repo format
if "github.com" in repo_url:
path = urlparse(repo_url).path.strip("/")
parts = path.split("/")
if len(parts) >= 2:
return parts[0], parts[1]
raise ValueError(f"Invalid GitHub URL format: {repo_url}")
except Exception as e:
logger.error(f"| Failed to extract repo info from URL {repo_url}: {e}")
raise
def get_service_config_for_agent(self) -> dict:
"""
Get service-specific configuration for agent execution.
        Returns the current token from the pool so the agent/MCP server uses the
        same credentials as the state manager.
Returns:
Dictionary containing configuration needed by the agent/MCP server
"""
service_config = {}
# Add GitHub token if available
if self.github_token:
service_config["github_token"] = self.github_token
return service_config
def set_verification_environment(self, messages_path: str = None) -> None:
"""
Set GitHub-specific environment variables for verification scripts.
This ensures verification scripts use the same token as the current
agent execution, maintaining consistency across the evaluation flow.
Args:
messages_path: Optional path to messages.json file for verification
"""
import os
# Set common MCP_MESSAGES if provided
if messages_path:
os.environ["MCP_MESSAGES"] = str(messages_path)
# Set GitHub-specific token
current_token = self.token_pool.get_current_token()
os.environ["MCP_GITHUB_TOKEN"] = current_token
logger.info("| Set MCP_GITHUB_TOKEN for verification scripts")
def _enable_github_actions(self, owner: str, repo_name: str):
"""Enable GitHub Actions for the repository using REST API."""
try:
# Enable GitHub Actions
url = (
f"https://api.github.com/repos/{owner}/{repo_name}/actions/permissions"
)
response = self.session.put(
url, json={"enabled": True, "allowed_actions": "all"}
)
if response.status_code in [200, 204]:
logger.info(
"| Successfully enabled GitHub Actions for %s/%s", owner, repo_name
)
else:
logger.warning(
"| Failed to enable GitHub Actions: %s %s",
response.status_code,
response.text,
)
except Exception as e:
logger.error("| Failed to enable GitHub Actions: %s", e)
def _disable_github_actions(self, owner: str, repo_name: str):
"""Disable GitHub Actions for the repository using REST API."""
try:
# Disable GitHub Actions
url = (
f"https://api.github.com/repos/{owner}/{repo_name}/actions/permissions"
)
response = self.session.put(url, json={"enabled": False})
if response.status_code in [200, 204]:
logger.info(
"| Successfully disabled GitHub Actions for %s/%s", owner, repo_name
)
else:
logger.warning(
"| Failed to disable GitHub Actions: %s %s",
response.status_code,
response.text,
)
except Exception as e:
logger.error("| Failed to disable GitHub Actions: %s", e)
def _disable_repository_notifications(self, owner: str, repo_name: str):
"""Disable repository notifications to prevent email spam."""
try:
# Set repository notification subscription to ignore
url = f"https://api.github.com/repos/{owner}/{repo_name}/subscription"
response = self.session.put(
url, json={"subscribed": False, "ignored": True}
)
if response.status_code in [200, 201]:
logger.info(
"| Successfully disabled notifications for %s/%s", owner, repo_name
)
elif response.status_code == 403:
# This is expected if the token doesn't have notifications scope
logger.debug(
"| Cannot disable notifications for %s/%s (token lacks notifications scope - this is OK)",
owner,
repo_name,
)
else:
logger.warning(
"| Failed to disable repository notifications: %s %s",
response.status_code,
response.text,
)
except Exception as e:
logger.error("| Failed to disable repository notifications: %s", e)
def _download_and_extract_github_template(self, template_name: str) -> bool:
"""
Download and extract GitHub template from CDN using wget and unzip commands.
This approach preserves original file timestamps and is simpler than Python zipfile.
Args:
template_name: Name of the template to download (e.g., "anthropics-claude-code")
Returns:
bool: True if download and extraction successful
"""
try:
import subprocess
import sys
import tempfile
import shutil
import os
# Get the URL from mapping
if template_name not in self.github_template_url_mapping:
logger.error(f"| No URL mapping found for template: {template_name}")
return False
template_url = self.github_template_url_mapping[template_name]
# Allow override via environment variable
template_url = os.getenv("GITHUB_TEMPLATE_URL", template_url)
logger.info(f"| ○ Downloading GitHub template from: {template_url}")
# Create a temporary directory for the download
with tempfile.TemporaryDirectory() as temp_dir:
temp_path = Path(temp_dir)
zip_path = temp_path / "github_template.zip"
# Step 1: Download using wget/curl
logger.info("| ○ Downloading GitHub template zip file...")
try:
# Use wget if available, otherwise fall back to curl
if sys.platform == "win32":
# Windows: try wget, fall back to curl
try:
result = subprocess.run(
["wget", "-O", str(zip_path), template_url],
capture_output=True,
text=True,
check=True,
)
except (subprocess.CalledProcessError, FileNotFoundError):
# Fall back to curl
result = subprocess.run(
["curl", "-L", "-o", str(zip_path), template_url],
capture_output=True,
text=True,
check=True,
)
else:
# Unix-like systems: try wget, fall back to curl
try:
result = subprocess.run(
["wget", "-O", str(zip_path), template_url],
capture_output=True,
text=True,
check=True,
)
except (subprocess.CalledProcessError, FileNotFoundError):
# Fall back to curl
result = subprocess.run(
["curl", "-L", "-o", str(zip_path), template_url],
capture_output=True,
text=True,
check=True,
)
logger.info("| ✓ Download completed successfully")
except Exception as e:
logger.error(f"| Download failed: {e}")
return False
# Step 2: Extract using unzip
logger.info("| ○ Extracting GitHub template...")
try:
# Extract to templates root directory
result = subprocess.run(
["unzip", "-o", str(zip_path), "-d", str(self.templates_root)],
capture_output=True,
text=True,
check=True,
)
logger.info("| ✓ Extraction completed successfully")
except Exception as e:
logger.error(f"| Extraction failed: {e}")
return False
# Step 3: Remove __MACOSX folder if it exists
macosx_path = self.templates_root / "__MACOSX"
if macosx_path.exists():
logger.info("| ○ Cleaning up macOS metadata...")
try:
shutil.rmtree(macosx_path)
logger.info("| ✓ Removed __MACOSX folder")
except Exception as e:
logger.warning(f"| Failed to remove __MACOSX folder: {e}")
# Verify the extracted template directory exists
template_path = self.templates_root / template_name
if not template_path.exists():
logger.error(
f"| Extracted template directory not found at expected path: {template_path}"
)
return False
logger.info(
f"| ✓ Successfully downloaded and extracted GitHub template to: {template_path}"
)
return True
except Exception as e:
logger.error(f"| Failed to download and extract GitHub template: {e}")
return False
================================================
FILE: src/mcp_services/github/github_task_manager.py
================================================
"""
GitHub Task Manager for MCPMark Evaluation Pipeline
====================================================
This module provides utilities for discovering, filtering, and managing
GitHub-based evaluation tasks.
The task manager is responsible for:
- Task discovery and filtering
- Task verification and result processing
- Task-specific logic (NOT LLM execution)
"""
import sys
from dataclasses import dataclass
from pathlib import Path
from typing import Any, Dict, List, Optional
from src.base.task_manager import BaseTask, BaseTaskManager
from src.logger import get_logger
logger = get_logger(__name__)
@dataclass
class GitHubTask(BaseTask):
"""Represents a single evaluation task for GitHub service."""
# GitHub-specific fields
repository_url: Optional[str] = None
branch_name: Optional[str] = None
pr_number: Optional[int] = None
issue_number: Optional[int] = None
expected_actions: Optional[List[str]] = None # Expected GitHub actions to verify
# Directory-based task slug (e.g., "update_readme")
task_name: str = ""
# No need to override name property, inherited from BaseTask
class GitHubTaskManager(BaseTaskManager):
"""Manages task discovery, filtering, and verification for GitHub-based MCPMark evaluation."""
def __init__(self, tasks_root: Path = None, task_suite: str = "standard"):
"""Initialize GitHub task manager.
Args:
            tasks_root: Path to the tasks directory
            task_suite: Name of the task suite to load (defaults to "standard")
        """
if tasks_root is None:
tasks_root = Path(__file__).resolve().parents[3] / "tasks"
# Call parent constructor
super().__init__(
tasks_root,
mcp_service="github",
task_class=GitHubTask,
task_organization="file",
task_suite=task_suite,
) # GitHub uses file-based tasks
# =========================================================================
# Service-specific implementations
# =========================================================================
# No custom task discovery methods needed; relying entirely on BaseTaskManager defaults.
def _create_task_from_files(
self, category_id: str, task_files_info: Dict[str, Any]
) -> Optional[GitHubTask]:
"""Instantiate a GitHubTask from the dictionary yielded by _find_task_files."""
import json
# Check for meta.json
meta_path = task_files_info["instruction_path"].parent / "meta.json"
final_category_id = category_id
task_id = task_files_info["task_id"]
if meta_path.exists():
try:
with open(meta_path, 'r') as f:
meta_data = json.load(f)
# Use values from meta.json if available
final_category_id = meta_data.get("category_id", category_id)
task_id = meta_data.get("task_id", task_id)
except Exception as e:
logger.warning(f"Failed to load meta.json from {meta_path}: {e}")
return GitHubTask(
task_instruction_path=task_files_info["instruction_path"],
task_verification_path=task_files_info["verification_path"],
service="github",
category_id=final_category_id,
task_id=task_id,
task_name=task_files_info["task_id"],
)
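    # Illustrative meta.json (hypothetical values) – only "category_id" and "task_id"
    # are read here; any other keys in the file are ignored by this method:
    #
    #     {
    #         "category_id": "issue_management",
    #         "task_id": "label_triage"
    #     }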
def _get_verification_command(self, task: GitHubTask) -> List[str]:
"""Get the verification command for GitHub tasks."""
return [sys.executable, str(task.task_verification_path)]
def get_task_instruction(self, task: GitHubTask) -> str:
"""Return task instruction prefixed with repository context.
Adds an English prefix to every GitHub task instruction so that the
agent knows **exactly** which repository to operate on, following the
pattern requested by the user:
Please execute the following task in my repository {owner}/{repo_name}:
If the repository URL has not yet been injected into the ``task`` (for
example when the state manager has not run), we fall back to a more
generic prefix without owner/repo placeholder.
"""
# Read the original task description first
base_instruction = task.get_task_instruction()
# Derive the owner/repo pair from the repository URL if available
prefix: str
if task.repository_url:
# Example URL: https://github.com/owner/repo_name.git (or without .git)
url_parts = task.repository_url.rstrip("/").replace(".git", "").split("/")
if len(url_parts) >= 2:
owner, repo_name = url_parts[-2], url_parts[-1]
prefix = f"Please execute the following task in my repository {owner}/{repo_name}:"
else:
prefix = "Please execute the following task:"
else:
prefix = "Please execute the following task:"
# Compose instruction with prefix
instruction_with_prefix = f"{prefix}\n\n{base_instruction.strip()}"
# Apply the common formatting suffix from base class
return self._format_task_instruction(instruction_with_prefix)
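    # Illustrative result (hypothetical repository URL): with
    # task.repository_url == "https://github.com/mcpmark-eval/Hello-World", the returned
    # instruction starts with:
    #
    #     Please execute the following task in my repository mcpmark-eval/Hello-World:
    #
    # followed by a blank line and the original task description.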
================================================
FILE: src/mcp_services/github/repo_exporter.py
================================================
"""
repo_exporter.py – Export public GitHub repository *and* open Issues/PRs
=====================================================================
Workflow
--------
1. Clone the public repository into a local working copy at
   ``${out_dir}/${owner}-${repo}/repo`` (full clone, all branches; no bare mirror).
2. Fetch Issues & Pull Requests (open PRs, newest first) via the GitHub REST
   API (no auth needed for public repos, but a token can be provided to raise
   the rate limit) and serialise them as JSON under the same folder:
   • ``issues.json`` – list[Issue]
   • ``pulls.json``  – list[PullRequest]
   • ``meta.json``   – {"owner", "repo", "default_branch", "pr_head_refs"}
Usage (CLI)
-----------
$ python -m src.mcp_services.github.repo_exporter \
    --source_repo_url https://github.com/octocat/Hello-World \
    --out-dir ./github_state
A token is read from the ``GITHUB_TOKEN`` environment variable (e.g. via
``.mcp_env``) to avoid the 60-req/h anonymous limit.
"""
from __future__ import annotations
import json
import logging
import os
from dotenv import load_dotenv
import subprocess
from pathlib import Path
from tempfile import mkdtemp
from typing import Optional
from urllib.parse import urlparse
import requests
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)
_API_ROOT = "https://api.github.com"
_DEFAULT_HEADERS = {
"Accept": "application/vnd.github.v3+json",
"User-Agent": "MCPMark/RepoExporter/1.0",
}
# ---------------------------------------------------------------------------
# Helper utilities
# ---------------------------------------------------------------------------
def _make_session(token: Optional[str] = None) -> requests.Session:
sess = requests.Session()
sess.headers.update(_DEFAULT_HEADERS)
if token:
sess.headers["Authorization"] = f"Bearer {token}"
return sess
def _parse_repo(url: str) -> tuple[str, str]:
parsed = urlparse(url)
parts = parsed.path.strip("/").split("/")
if len(parts) < 2:
raise ValueError(f"Invalid GitHub repo URL: {url}")
return parts[0], parts[1]
# ---------------------------------------------------------------------------
# Core export logic
# ---------------------------------------------------------------------------
def export_repository(
source_repo_url: str,
out_dir: str = "./github_state",
github_token: str | None = None,
max_issues: int | None = None,
max_pulls: int | None = None,
) -> str:
"""Export repository code plus Issues/PRs to ``out_dir``.
``max_issues`` / ``max_pulls`` – when supplied, export **only** the most
recently created *open* Issues or Pull Requests (respectively).
Returns the absolute path of the export folder.
"""
owner, repo = _parse_repo(source_repo_url)
export_root = Path(out_dir).expanduser().resolve()
repo_dir = export_root / f"{owner}-{repo}"
repo_dir.mkdir(parents=True, exist_ok=True)
# ------------------------------------------------------------------
# 1. Clone repository – full or shallow *working* clone (no bare repo)
# ------------------------------------------------------------------
repo_path = repo_dir / "repo"
if repo_path.exists():
logger.info("[clone] Repository already exists, skipping clone: %s", repo_path)
else:
logger.info("[clone] Cloning %s/%s to %s", owner, repo, repo_path)
env = {
**os.environ,
"GIT_TERMINAL_PROMPT": "0",
"GIT_LFS_SKIP_SMUDGE": "1",
}
tmp_dir = mkdtemp(prefix="mcp_export_")
try:
# Always perform a full clone (no shallow depth limitation).
clone_cmd = [
"git",
"clone",
"--no-single-branch",
f"https://github.com/{owner}/{repo}.git",
tmp_dir,
]
subprocess.run(clone_cmd, check=True, capture_output=True, env=env)
subprocess.run(["mv", tmp_dir, str(repo_path)], check=True)
logger.info("[clone] Clone completed")
finally:
# tmp_dir moved if success; remove if left
if os.path.isdir(tmp_dir):
subprocess.run(["rm", "-rf", tmp_dir])
# ------------------------------------------------------------------
# 2. Dump Issues & Pull Requests
# ------------------------------------------------------------------
sess = _make_session(github_token)
def _paginate(url: str, state: str = "all", extra_params: dict | None = None):
page = 1
while True:
params = {"state": state, "per_page": 100, "page": page}
if extra_params:
params.update(extra_params)
resp = sess.get(url, params=params)
if resp.status_code != 200:
logger.warning("Failed to list: %s – %s", url, resp.text)
break
items = resp.json()
if not items:
break
yield from items
page += 1
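    # Illustrative use of the generator above (hypothetical endpoint): it yields items
    # across pages of 100 until an empty page or a non-200 response is returned:
    #
    #     for issue in _paginate(f"{_API_ROOT}/repos/{owner}/{repo}/issues", state="open"):
    #         print(issue["number"], issue["title"])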
# --------------------------------------------------------------
# Helper: fetch all issue comments for a given issue / PR number
# --------------------------------------------------------------
def _fetch_issue_comments(number: int) -> list[dict]:
"""Return a list of {user, body} comment dicts for the given issue/PR."""
comments = []
for c in _paginate(
f"{_API_ROOT}/repos/{owner}/{repo}/issues/{number}/comments"
):
comments.append(
{
"user": c.get("user", {}).get("login", "unknown"),
"body": c.get("body", ""),
}
)
return comments
# --------------------------------------------------------------
# Helper: fetch all *review* comments (code comments) for a PR
# --------------------------------------------------------------
def _fetch_review_comments(number: int) -> list[dict]:
"""Return a list of {user, body} review comments for the given PR."""
comments = []
for c in _paginate(f"{_API_ROOT}/repos/{owner}/{repo}/pulls/{number}/comments"):
comments.append(
{
"user": c.get("user", {}).get("login", "unknown"),
"body": c.get("body", ""),
}
)
return comments
# Issues (non-PR)
issues = []
# If max_issues is 0, skip fetching issues entirely
if max_issues == 0:
logger.info("[export] Skipping issues (max_issues=0)")
else:
for itm in _paginate(
f"{_API_ROOT}/repos/{owner}/{repo}/issues",
extra_params={"sort": "created", "direction": "desc"},
):
if "pull_request" in itm:
continue
issues.append(
{
"title": itm.get("title"),
"body": itm.get("body", ""),
"labels": [lbl.get("name") for lbl in itm.get("labels", [])],
"state": itm.get("state", "open"), # Store issue state
"number": itm.get("number"), # Store issue number for reference
"comments": _fetch_issue_comments(itm.get("number")),
}
)
if max_issues is not None and len(issues) >= max_issues:
break
(repo_dir / "issues.json").write_text(json.dumps(issues, indent=2))
logger.info("[export] Saved %d issues", len(issues))
# Pull requests – include *all* PRs including those from forks
pulls = []
pr_head_refs: set[str] = set()
fork_pr_branches: dict[str, dict] = {} # Maps PR branch names to fork info
# If max_pulls is 0, skip fetching pull requests entirely
if max_pulls == 0:
logger.info("[export] Skipping pull requests (max_pulls=0)")
else:
for pr in _paginate(
f"{_API_ROOT}/repos/{owner}/{repo}/pulls",
state="open",
extra_params={"sort": "created", "direction": "desc"},
):
pr_number = pr.get("number")
head = pr.get("head", {})
if head is None:
logger.warning("PR #%s has no head (deleted fork), skipping", pr_number)
continue # skip PRs with missing head (deleted fork)
head_repo = head.get("repo")
head_ref = head.get("ref")
head_sha = head.get("sha")
if head_repo is None:
logger.warning("PR #%s source repo was deleted, skipping", pr_number)
continue # skip PRs where source repo was deleted
head_repo_full = head_repo.get("full_name")
is_from_fork = head_repo_full != f"{owner}/{repo}"
# Create PR data with fork information
pr_data = {
"number": pr_number,
"title": pr.get("title"),
"body": pr.get("body", ""),
"head": head_ref,
"base": pr.get("base", {}).get("ref"),
"is_from_fork": is_from_fork,
}
if is_from_fork:
# Store additional metadata for forked PRs
pr_data["fork_owner"] = head_repo.get("owner", {}).get("login")
pr_data["fork_repo"] = head_repo.get("name")
pr_data["head_sha"] = head_sha
# Create a unique branch name for this forked PR
fork_branch_name = f"pr/{pr_number}-{pr_data['fork_owner']}-{head_ref}"
pr_data["local_branch"] = fork_branch_name
fork_pr_branches[fork_branch_name] = {
"clone_url": head_repo.get("clone_url"),
"ref": head_ref,
"sha": head_sha,
"pr_number": pr_number,
}
else:
# For non-fork PRs, keep the original branch reference
pr_head_refs.add(head_ref)
# Attach comments
pr_data["comments"] = _fetch_issue_comments(pr_number)
pr_data["review_comments"] = _fetch_review_comments(pr_number)
pulls.append(pr_data)
if max_pulls is not None and len(pulls) >= max_pulls:
break
(repo_dir / "pulls.json").write_text(json.dumps(pulls, indent=2))
logger.info("[export] Saved %d pull requests", len(pulls))
# Get default branch info first (needed for fetching)
sess = _make_session(github_token)
try:
repo_info = sess.get(f"{_API_ROOT}/repos/{owner}/{repo}")
default_branch = repo_info.json().get("default_branch", "main")
except Exception:
default_branch = "main"
# Fetch branches from non-fork PRs (branches from the same repository)
non_fork_branches = list(pr_head_refs) # These are branches from the same repo
# Always include the default branch in the branches to fetch
if default_branch not in non_fork_branches:
non_fork_branches.append(default_branch)
pr_head_refs.add(default_branch)
if non_fork_branches:
logger.info(
"[fetch] Fetching %d branches from same repository (including default branch '%s')",
len(non_fork_branches),
default_branch,
)
try:
# Fetch all remote branches to ensure we have the PR branches
subprocess.run(
["git", "-C", str(repo_path), "fetch", "origin", "--no-tags"],
check=True,
capture_output=True,
)
# Create local branches for each PR branch
for branch in non_fork_branches:
try:
# Create local branch tracking the remote branch
subprocess.run(
[
"git",
"-C",
str(repo_path),
"branch",
"--track",
branch,
f"origin/{branch}",
],
check=False,
capture_output=True,
) # check=False because branch might already exist
logger.info("[fetch] Created local branch %s", branch)
except subprocess.CalledProcessError:
# Branch might already exist, which is fine
pass
except subprocess.CalledProcessError as e:
logger.warning(
"[fetch] Failed to fetch branches from origin: %s",
e.stderr.decode(errors="ignore") if e.stderr else str(e),
)
# Fetch branches from forks for PRs
if fork_pr_branches:
logger.info(
"[fetch] Fetching branches from %d forked PRs", len(fork_pr_branches)
)
for branch_name, fork_info in fork_pr_branches.items():
try:
logger.info(
"[fetch] Fetching branch %s from fork %s",
fork_info["ref"],
fork_info["clone_url"],
)
# Add fork as remote and fetch the specific branch
remote_name = f"fork-pr-{fork_info['pr_number']}"
# Add remote
subprocess.run(
[
"git",
"-C",
str(repo_path),
"remote",
"add",
remote_name,
fork_info["clone_url"],
],
check=True,
capture_output=True,
)
# Fetch the specific branch from the fork
subprocess.run(
[
"git",
"-C",
str(repo_path),
"fetch",
remote_name,
f"{fork_info['ref']}:refs/heads/{branch_name}",
],
check=True,
capture_output=True,
)
# Remove the remote after fetching
subprocess.run(
["git", "-C", str(repo_path), "remote", "remove", remote_name],
check=True,
capture_output=True,
)
# Add the fork branch to pr_head_refs so it gets pushed
pr_head_refs.add(branch_name)
logger.info("[fetch] Successfully fetched branch %s", branch_name)
except subprocess.CalledProcessError as e:
logger.warning(
"[fetch] Failed to fetch branch from fork PR #%s: %s",
fork_info["pr_number"],
e.stderr.decode(errors="ignore") if e.stderr else str(e),
)
except Exception as e:
logger.warning(
"[fetch] Unexpected error fetching fork PR #%s: %s",
fork_info["pr_number"],
str(e),
)
meta = {
"owner": owner,
"repo": repo,
"default_branch": default_branch,
"pr_head_refs": sorted(pr_head_refs),
}
(repo_dir / "meta.json").write_text(json.dumps(meta, indent=2))
logger.info("[done] Export finished – data stored at %s", repo_dir)
return str(repo_dir)
# ---------------------------------------------------------------------------
# CLI
# ---------------------------------------------------------------------------
if __name__ == "__main__":
import argparse
load_dotenv(".mcp_env")
parser = argparse.ArgumentParser(
description="Export public GitHub repository with Issues/PRs"
)
parser.add_argument(
"--source_repo_url", required=True, help="HTTPS URL of the public repository"
)
parser.add_argument(
"--out-dir", default="./github_state", help="Output directory root"
)
parser.add_argument(
"--max-issues",
type=int,
default=20,
help="Export only the latest N issues (optional)",
)
parser.add_argument(
"--max-pulls",
type=int,
default=5,
help="Export only the latest N pull requests (optional)",
)
args = parser.parse_args()
token = os.getenv("GITHUB_TOKEN")
export_repository(
args.source_repo_url, args.out_dir, token, args.max_issues, args.max_pulls
)
================================================
FILE: src/mcp_services/github/repo_importer.py
================================================
"""
repo_importer.py – Restore previously exported GitHub repository into an org/user
===============================================================================
Given a local export folder created by ``repo_exporter.py`` that contains a
``repo`` working clone and JSON dumps of Issues/PRs, this script:
1. Creates an empty repository under the specified owner (user/org) using the
   provided GitHub token.
2. Pushes the Git history from the local clone to the target repo
   (falling back to per-ref pushes to avoid timeouts).
3. Re-creates the exported Issues & Pull Requests from the JSON dump.
CLI usage
---------
$ python -m src.mcp_services.github.repo_importer \
    --template_dir ./github_template_repo/octocat-Hello-World \
    --target-owner EvalOrg
The GitHub token is read from the ``GITHUB_TOKEN`` environment variable (or
``.mcp_env``); the target repository is always created as private.
"""
from __future__ import annotations
import json
import logging
import os
import subprocess
import time
from pathlib import Path
from typing import Iterable
import requests
from dotenv import load_dotenv
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)
_API_ROOT = "https://api.github.com"
_HEADERS = {
"Accept": "application/vnd.github.v3+json",
"User-Agent": "MCPMark/RepoImporter/1.0",
}
# ---------------------------------------------------------------------------
# Helper functions copied / simplified from repo_mirror (shallow clone logic removed)
# ---------------------------------------------------------------------------
def _make_session(token: str) -> requests.Session:
sess = requests.Session()
sess.headers.update(_HEADERS | {"Authorization": f"Bearer {token}"})
return sess
def _create_target_repo(
sess: requests.Session, owner: str, repo_name: str, description: str, private: bool
) -> str:
data = {
"name": repo_name,
"description": description,
"private": private,
"auto_init": False,
"has_issues": True,
"has_projects": True,
"has_wiki": False,
}
# Determine if owner == auth user
auth_user = _get_authenticated_user(sess)
create_url = (
f"{_API_ROOT}/user/repos"
if owner == auth_user
else f"{_API_ROOT}/orgs/{owner}/repos"
)
resp = sess.post(create_url, json=data)
if resp.status_code == 422 and "name already exists" in resp.text:
logger.warning("Repository already exists; attempting to delete and recreate …")
_delete_repo(sess, owner, repo_name)
resp = sess.post(create_url, json=data)
if resp.status_code not in (200, 201):
raise RuntimeError(f"Failed to create repo: {resp.status_code} {resp.text}")
html_url = resp.json()["html_url"]
logger.info("[init] Target repository created: %s", html_url)
return html_url
def _get_authenticated_user(sess: requests.Session) -> str:
resp = sess.get(f"{_API_ROOT}/user")
resp.raise_for_status()
return resp.json()["login"]
def _delete_repo(sess: requests.Session, owner: str, repo: str):
sess.delete(f"{_API_ROOT}/repos/{owner}/{repo}")
def _list_refs(repo_dir: str) -> list[str]:
result = subprocess.run(
["git", "-C", repo_dir, "for-each-ref", "--format=%(refname)"],
check=True,
capture_output=True,
text=True,
)
return result.stdout.strip().splitlines()
def _push_repo(
repo_path: Path,
owner: str,
repo_name: str,
token: str,
required_refs: Iterable[str] | None = None,
):
"""Push repository to GitHub.
Strategy:
1. Attempt a full `git push --mirror`.
2. If that fails (e.g. due to large repo), fall back to pushing refs one-by-one.
"""
dst_url = f"https://x-access-token:{token}@github.com/{owner}/{repo_name}.git"
# First try mirror push (fast path)
try:
subprocess.run(
["git", "-C", str(repo_path), "push", "--mirror", dst_url],
check=True,
capture_output=True,
)
logger.info("[push] Mirror push succeeded")
return
except subprocess.CalledProcessError as err:
logger.warning(
"[push] Mirror push failed (%s). Falling back to per-ref",
err.stderr.decode(errors="ignore"),
)
# ------------------------------------------------------------------
# Fallback: push each ref individually (robust but slower)
# ------------------------------------------------------------------
refs = required_refs or _list_refs(str(repo_path))
logger.info("[push] Pushing %d refs individually …", len(refs))
for ref in refs:
for attempt in range(3):
try:
subprocess.run(
["git", "-C", str(repo_path), "push", dst_url, f"{ref}:{ref}"],
check=True,
capture_output=True,
)
break
except subprocess.CalledProcessError as ref_err:
if attempt == 2:
raise RuntimeError(
f"Failed to push ref {ref}: {ref_err.stderr}"
) from ref_err
time.sleep(2 * (attempt + 1))
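# Illustrative call (hypothetical values) – push only the refs the import needs,
# trying the fast mirror push first and falling back to per-ref pushes:
#
#     _push_repo(
#         Path("./github_state/octocat-Hello-World/repo"),
#         "mcpmark-eval",
#         "Hello-World",
#         token,
#         required_refs=["refs/heads/main", "refs/heads/feature-1"],
#     )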
def _create_comment(
sess: requests.Session, owner: str, repo: str, issue_number: int, body: str
):
"""Create a comment on an Issue or Pull Request. Returns True on success."""
resp = sess.post(
f"{_API_ROOT}/repos/{owner}/{repo}/issues/{issue_number}/comments",
json={"body": body},
)
if resp.status_code not in (200, 201):
logger.debug("Failed to create comment on #%s: %s", issue_number, resp.text)
return False
return True
def _create_issue(
sess: requests.Session,
owner: str,
repo: str,
title: str,
body: str,
labels: list[str],
state: str = "open",
number: int = None,
):
"""Create a new Issue and return the *new* issue number (or None on failure)."""
data = {"title": title, "body": body, "labels": labels}
resp = sess.post(f"{_API_ROOT}/repos/{owner}/{repo}/issues", json=data)
if resp.status_code not in (200, 201):
logger.debug("Failed to create issue #%s: %s", number, resp.text)
return None
new_number = resp.json().get("number")
# Close issue if original state was closed
if state == "closed":
close_resp = sess.patch(
f"{_API_ROOT}/repos/{owner}/{repo}/issues/{new_number}",
json={"state": "closed"},
)
if close_resp.status_code not in (200, 201):
logger.debug("Failed to close issue #%s: %s", new_number, close_resp.text)
return new_number
def _create_pull(
sess: requests.Session,
owner: str,
repo: str,
title: str,
body: str,
head: str,
base: str,
pr_number: int = None,
):
"""Create a Pull Request and return the *new* PR number (or None on failure)."""
data = {"title": title, "body": body, "head": head, "base": base}
resp = sess.post(f"{_API_ROOT}/repos/{owner}/{repo}/pulls", json=data)
if resp.status_code not in (200, 201):
logger.warning(
"Failed to create PR #%s (head: %s, base: %s): %s",
pr_number,
head,
base,
resp.text,
)
return None
return resp.json().get("number")
def _enable_github_actions(sess: requests.Session, owner: str, repo_name: str):
"""Enable GitHub Actions for the repository using REST API."""
try:
url = f"{_API_ROOT}/repos/{owner}/{repo_name}/actions/permissions"
response = sess.put(url, json={"enabled": True, "allowed_actions": "all"})
if response.status_code in [200, 204]:
logger.info(
"Successfully enabled GitHub Actions for %s/%s", owner, repo_name
)
else:
logger.warning(
"Failed to enable GitHub Actions: %s %s",
response.status_code,
response.text,
)
except Exception as e:
logger.error("Failed to enable GitHub Actions: %s", e)
def _disable_repository_notifications(
sess: requests.Session, owner: str, repo_name: str
):
"""Disable repository notifications to prevent email spam."""
try:
url = f"{_API_ROOT}/repos/{owner}/{repo_name}/subscription"
response = sess.put(url, json={"subscribed": False, "ignored": True})
if response.status_code in [200, 201]:
logger.info(
"Successfully disabled notifications for %s/%s", owner, repo_name
)
elif response.status_code == 403:
# This is expected if the token doesn't have notifications scope
logger.debug(
"Cannot disable notifications for %s/%s (token lacks notifications scope - this is OK)",
owner,
repo_name,
)
else:
logger.warning(
"Failed to disable repository notifications: %s %s",
response.status_code,
response.text,
)
except Exception as e:
logger.error("Failed to disable repository notifications: %s", e)
def _set_default_branch(
sess: requests.Session, owner: str, repo_name: str, default_branch: str
):
"""Set the default branch for a repository."""
if default_branch != "main": # Only update if not already main
logger.info("[import] Setting default branch to '%s'", default_branch)
url = f"{_API_ROOT}/repos/{owner}/{repo_name}"
data = {"default_branch": default_branch}
resp = sess.patch(url, json=data)
if resp.status_code in (200, 201):
logger.info(
"[import] Successfully set default branch to '%s'", default_branch
)
else:
logger.warning(
"[import] Failed to set default branch: %s %s",
resp.status_code,
resp.text,
)
def _remove_github_directory(repo_path: Path, owner: str, repo_name: str, token: str):
"""Remove .github directory after pushing and commit the deletion."""
import shutil
github_dir = repo_path / ".github"
if github_dir.exists():
logger.info("[import] Removing .github directory after push …")
shutil.rmtree(github_dir)
# Commit the deletion
subprocess.run(
["git", "-C", str(repo_path), "add", "-A"], check=True, capture_output=True
)
subprocess.run(
["git", "-C", str(repo_path), "commit", "-m", "Remove .github directory"],
capture_output=True,
)
# Push the new commit
dst_url = f"https://x-access-token:{token}@github.com/{owner}/{repo_name}.git"
subprocess.run(
["git", "-C", str(repo_path), "push", dst_url],
check=True,
capture_output=True,
)
# ---------------------------------------------------------------------------
# Main import logic
# ---------------------------------------------------------------------------
def import_repository(
template_dir: str, github_token: str, target_owner: str, private: bool = False
) -> str:
"""Import repository from a local template directory to GitHub."""
# ------------------------------------------------------------------
# Ensure Git HTTP buffer large enough to avoid 400 errors on big pushes
# ------------------------------------------------------------------
try:
subprocess.run(
[
"git",
"config",
"--global",
"http.postBuffer",
"157286400", # 150 MiB
],
check=True,
capture_output=True,
)
logger.debug("[init] Set git http.postBuffer to 150MiB globally")
except subprocess.CalledProcessError as cfg_err:
logger.warning(
"[init] Failed to set http.postBuffer – proceeding anyway: %s",
cfg_err.stderr.decode(errors="ignore"),
)
tdir = Path(template_dir).expanduser().resolve()
meta = json.loads((tdir / "meta.json").read_text())
repo_name = meta["repo"]
pr_head_refs = meta.get("pr_head_refs", [])
default_branch = meta.get("default_branch", "main")
# Also include fork PR branches that were fetched
pulls = json.loads((tdir / "pulls.json").read_text())
fork_branches = [
pr["local_branch"]
for pr in pulls
if pr.get("is_from_fork", False) and "local_branch" in pr
]
needed_refs = (
[f"refs/heads/{default_branch}"]
+ [f"refs/heads/{h}" for h in pr_head_refs]
+ [f"refs/heads/{b}" for b in fork_branches]
)
sess = _make_session(github_token)
# 1. Create target repo
html_url = _create_target_repo(
sess, target_owner, repo_name, f"Restored mirror of {repo_name}", private
)
# 2. Push code
repo_path = tdir / "repo"
logger.info("[phase] Pushing git history …")
_push_repo(repo_path, target_owner, repo_name, github_token, needed_refs)
# Set the default branch if it's not 'main'
_set_default_branch(sess, target_owner, repo_name, default_branch)
# Remove .github directory right after pushing, before creating issues/PRs
_remove_github_directory(repo_path, target_owner, repo_name, github_token)
# 3. Re-create issues & PRs
logger.info("[phase] Re-creating issues …")
issues = json.loads((tdir / "issues.json").read_text())
created_issues = 0
for itm in issues:
new_issue_no = _create_issue(
sess,
target_owner,
repo_name,
itm["title"],
itm.get("body", ""),
itm.get("labels", []),
itm.get("state", "open"),
itm.get("number"),
)
if new_issue_no:
created_issues += 1
for c in itm.get("comments", []):
comment_body = f"*Original author: @{c['user']}*\n\n{c['body']}"
_create_comment(
sess, target_owner, repo_name, new_issue_no, comment_body
)
logger.info("[phase] Created %d out of %d issues", created_issues, len(issues))
logger.info("[phase] Re-creating pull requests …")
pulls = json.loads((tdir / "pulls.json").read_text())
created_prs = 0
skipped_prs = 0
for pr in pulls:
# Use local_branch for forked PRs, otherwise use original head
head_branch = pr.get("local_branch", pr["head"])
# Add note to PR body if it's from a fork
body = pr.get("body", "")
if pr.get("is_from_fork", False):
fork_note = f"\n\n---\n_This PR was originally from a fork: **{pr.get('fork_owner')}/{pr.get('fork_repo')}** (branch: `{pr['head']}`)_"
body = (
body + fork_note if body else fork_note[2:]
) # Remove leading newlines if body is empty
new_pr_number = _create_pull(
sess,
target_owner,
repo_name,
pr["title"],
body,
head_branch,
pr["base"],
pr.get("number"),
)
if new_pr_number:
created_prs += 1
for c in pr.get("comments", []):
comment_body = f"*Original author: @{c['user']}*\n\n{c['body']}"
_create_comment(
sess, target_owner, repo_name, new_pr_number, comment_body
)
for rc in pr.get("review_comments", []):
comment_body = (
f"*Original author: @{rc['user']}* (review)\n\n{rc['body']}"
)
_create_comment(
sess, target_owner, repo_name, new_pr_number, comment_body
)
else:
skipped_prs += 1
logger.info("[phase] Created %d PRs, skipped %d PRs", created_prs, skipped_prs)
# Enable GitHub Actions after creating issues and PRs
logger.info("[import] Enabling GitHub Actions …")
_enable_github_actions(sess, target_owner, repo_name)
# Disable notifications to prevent email spam
logger.info("[import] Disabling repository notifications …")
_disable_repository_notifications(sess, target_owner, repo_name)
logger.info("[done] Import complete: %s", html_url)
return html_url
# ---------------------------------------------------------------------------
# CLI
# ---------------------------------------------------------------------------
if __name__ == "__main__":
import argparse
load_dotenv(".mcp_env")
parser = argparse.ArgumentParser(
description="Import repository from local template into GitHub"
)
parser.add_argument("--template_dir", help="Path to exported template directory")
parser.add_argument(
"--target-owner",
"-o",
default="mcpmark-eval",
help="User or organisation that will own the new repository",
)
args = parser.parse_args()
token = os.getenv("GITHUB_TOKEN")
if not token:
parser.error("GITHUB_TOKEN not set in environment or .mcp_env")
# Always create the target repository as private
import_repository(args.template_dir, token, args.target_owner, True)
================================================
FILE: src/mcp_services/github/token_pool.py
================================================
"""
GitHub Token Pool Manager
=========================
Simple round-robin token pool for distributing API requests across multiple tokens
to avoid rate limit issues.
"""
from typing import List
from src.logger import get_logger
logger = get_logger(__name__)
class GitHubTokenPool:
"""
Manages a pool of GitHub tokens with round-robin selection.
"""
def __init__(self, tokens: List[str]):
"""
Initialize token pool.
Args:
tokens: List of GitHub personal access tokens
"""
if not tokens:
raise ValueError("Token pool must contain at least one token")
self.tokens = tokens
self.current_index = 0
logger.info(f"Initialized GitHub token pool with {len(tokens)} token(s)")
def get_next_token(self) -> str:
"""
Get the next token in round-robin fashion.
Returns:
The next GitHub token to use
"""
token = self.tokens[self.current_index]
self.current_index = (self.current_index + 1) % len(self.tokens)
return token
def get_current_token(self) -> str:
"""
Get the current token without advancing the index.
Returns:
The current GitHub token
"""
return self.tokens[self.current_index]
@property
def pool_size(self) -> int:
"""Get the number of tokens in the pool."""
return len(self.tokens)
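# Illustrative round-robin behaviour (hypothetical tokens):
#
#     pool = GitHubTokenPool(["ghp_tokenA", "ghp_tokenB"])
#     pool.get_current_token()  # -> "ghp_tokenA"
#     pool.get_next_token()     # -> "ghp_tokenA", then the index advances
#     pool.get_current_token()  # -> "ghp_tokenB"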
================================================
FILE: src/mcp_services/insforge/__init__.py
================================================
"""Insforge MCP Service Implementation for MCPMark."""
================================================
FILE: src/mcp_services/insforge/insforge_login_helper.py
================================================
"""
Insforge Login Helper for MCPMark
==================================
Handles Insforge backend authentication and connection validation.
"""
import json
import requests
from pathlib import Path
from typing import Optional, Dict, Any
from src.base.login_helper import BaseLoginHelper
from src.logger import get_logger
logger = get_logger(__name__)
class InsforgeLoginHelper(BaseLoginHelper):
"""Handles Insforge backend authentication and connection validation."""
def __init__(
self,
api_key: str,
backend_url: str,
state_path: Optional[Path] = None,
):
"""Initialize Insforge login helper.
Args:
api_key: Insforge backend API key for authentication
backend_url: Insforge backend URL (e.g., https://your-app.insforge.app)
state_path: Path to save connection state
"""
super().__init__()
self.api_key = api_key
self.backend_url = backend_url.rstrip('/')
self.state_path = state_path or Path.home() / ".mcpbench" / "insforge_auth.json"
# Ensure state directory exists
self.state_path.parent.mkdir(parents=True, exist_ok=True)
def login(self, **kwargs) -> bool:
"""Test Insforge backend connection and validate API key.
Returns:
bool: True if connection successful and API key valid
"""
try:
# Test 1: Basic connectivity - try to get backend metadata
logger.info(f"Testing connection to Insforge backend: {self.backend_url}")
headers = {
"Authorization": f"Bearer {self.api_key}",
"Content-Type": "application/json",
}
# Test with a simple API endpoint - get current user or backend info
# Try the auth current session endpoint first
test_url = f"{self.backend_url}/api/auth/sessions/current"
response = requests.get(
test_url,
headers=headers,
timeout=10,
)
if response.status_code == 200:
# API key is valid and can authenticate
logger.info("✓ Insforge API key authentication successful")
connection_info = {
"backend_url": self.backend_url,
"authenticated": True,
"authenticated_at": self._get_current_timestamp(),
}
elif response.status_code == 401:
# Invalid API key
logger.error("✗ Invalid Insforge API key")
return False
else:
# API key might be admin key, try a different endpoint
# Try listing tables/backend metadata as a test
logger.info("Testing with backend metadata endpoint...")
# Simple connectivity test - just check if backend is reachable
health_url = f"{self.backend_url}/api/health"
try:
health_response = requests.get(health_url, timeout=5)
if health_response.status_code in [200, 404]: # 404 is ok, backend is reachable
logger.info("✓ Insforge backend is reachable")
connection_info = {
"backend_url": self.backend_url,
"api_key_type": "admin",
"authenticated": True,
"authenticated_at": self._get_current_timestamp(),
}
else:
logger.warning(f"Unexpected response from backend: {health_response.status_code}")
connection_info = {
"backend_url": self.backend_url,
"authenticated": True,
"authenticated_at": self._get_current_timestamp(),
}
except Exception as e:
logger.warning(f"Health check failed, but proceeding: {e}")
# Still consider it successful if we have credentials
connection_info = {
"backend_url": self.backend_url,
"authenticated": True,
"authenticated_at": self._get_current_timestamp(),
}
# Save connection state
self._save_connection_state(connection_info)
logger.info(f"Insforge backend connection validated: {self.backend_url}")
return True
except requests.exceptions.Timeout:
logger.error(f"Connection timeout to Insforge backend: {self.backend_url}")
return False
except requests.exceptions.ConnectionError:
logger.error(f"Cannot connect to Insforge backend: {self.backend_url}")
return False
except Exception as e:
logger.error(f"Unexpected error during Insforge authentication: {e}")
return False
def _save_connection_state(self, state: Dict[str, Any]):
"""Save connection state to file."""
try:
# Don't save API key
safe_state = {k: v for k, v in state.items() if k not in ["api_key", "access_token"]}
with open(self.state_path, "w") as f:
json.dump(safe_state, f, indent=2)
# Set restrictive permissions
self.state_path.chmod(0o600)
logger.info(f"Connection state saved to: {self.state_path}")
except Exception as e:
logger.error(f"Failed to save connection state: {e}")
def _get_current_timestamp(self) -> str:
"""Get current timestamp in ISO format."""
from datetime import datetime, timezone
return datetime.now(timezone.utc).isoformat()
def is_connected(self) -> bool:
"""Check if we can connect to Insforge backend."""
return self.login()
def get_connection_params(self) -> Dict[str, Any]:
"""Get connection parameters (without API key)."""
return {
"backend_url": self.backend_url,
}
================================================
FILE: src/mcp_services/insforge/insforge_state_manager.py
================================================
"""
Insforge State Manager for MCPMark
===================================
Manages backend state for Insforge tasks including setup via prepare_environment.py
and resource cleanup tracking.
"""
import os
import sys
import subprocess
import requests
from pathlib import Path
from typing import Optional, Dict, Any, List
from src.base.state_manager import BaseStateManager, InitialStateInfo
from src.base.task_manager import BaseTask
from src.logger import get_logger
logger = get_logger(__name__)
class InsforgeStateManager(BaseStateManager):
"""Manages Insforge backend state for task evaluation."""
def __init__(
self,
api_key: str,
backend_url: str,
):
"""Initialize Insforge state manager.
Args:
api_key: Insforge backend API key for authentication
backend_url: Insforge backend URL (e.g., https://your-app.insforge.app)
"""
super().__init__(service_name="insforge")
self.api_key = api_key
self.backend_url = backend_url.rstrip('/')
# HTTP headers for API requests
self.headers = {
"Authorization": f"Bearer {self.api_key}",
"Content-Type": "application/json",
}
# Track current task context for agent configuration
self._current_task_context: Optional[Dict[str, Any]] = None
# Validate connection on initialization
try:
self._test_connection()
logger.info("Insforge state manager initialized successfully")
except Exception as e:
raise RuntimeError(f"Insforge initialization failed: {e}")
# Store baseline tables (system tables that exist before any tasks run)
self._baseline_tables = set(
(t['schema'], t['name']) for t in self._get_all_tables()
)
logger.debug(f"Stored baseline: {len(self._baseline_tables)} tables")
def _test_connection(self):
"""Test backend connection."""
try:
# Simple connectivity test - try any endpoint
response = requests.get(
f"{self.backend_url}/api/health",
timeout=5,
)
# Any response (even 404) means backend is reachable
logger.debug(f"Insforge backend connectivity test: {response.status_code}")
except requests.exceptions.RequestException:
# Try with API key
try:
response = requests.get(
f"{self.backend_url}/api/auth/sessions/current",
headers=self.headers,
timeout=5,
)
logger.debug(f"Insforge backend auth test: {response.status_code}")
except Exception as inner_e:
raise RuntimeError(f"Cannot connect to Insforge backend: {inner_e}")
def _create_initial_state(self, task: BaseTask) -> Optional[InitialStateInfo]:
"""Create initial backend state for a task.
Restores from backup which may place tables in public or task-specific schema.
Args:
task: Task for which to create initial state
Returns:
InitialStateInfo object or None if creation failed
"""
try:
# Generate unique state ID for this task run
state_id = f"{task.category_id}_{task.task_id}_{self._get_timestamp()}"
schema_name = task.category_id
logger.info(f"| Creating initial state for Insforge task: {task.name}")
# Drop schema first (cleanup from previous runs)
self._drop_schema(schema_name)
# Get list of existing tables before restore (to track what we create)
tables_before = self._get_all_tables()
logger.info(f"| Tables before restore: {len(tables_before)}")
# Note: Don't create schema here - pg_restore will create it from the backup
# Restore from backup if backup exists (may create tables in public or task schema)
if self._restore_from_backup(schema_name):
logger.info(f"| ✓ Restored '{schema_name}' from backup")
else:
logger.info(f"| ○ No backup found for '{schema_name}'")
# Run prepare_environment.py if it exists
task_prepared = self._run_prepare_environment(task)
if not task_prepared:
logger.debug(f"| No prepare_environment.py found for task {task.name}")
# Get list of tables after restore (to track what we need to clean up)
tables_after = self._get_all_tables()
# Track ALL new tables created by the restore (compare before/after)
tables_before_set = {(t['schema'], t['name']) for t in tables_before}
created_tables = [
t for t in tables_after
if (t['schema'], t['name']) not in tables_before_set
]
logger.info(f"| Tracked {len(created_tables)} new tables for cleanup")
for t in created_tables:
logger.debug(f"| - {t['schema']}.{t['name']}")
# Track the task context including created tables
context = {
"state_id": state_id,
"category_id": task.category_id,
"task_id": task.task_id,
"task_name": task.name,
"schema": schema_name,
"created_tables": created_tables, # Track all created tables
}
return InitialStateInfo(
state_id=state_id,
state_url=self.backend_url,
metadata=context,
)
except Exception as e:
logger.error(f"Failed to create initial state for {task.name}: {e}")
return None
def _store_initial_state_info(
self, task: BaseTask, state_info: InitialStateInfo
) -> None:
"""Store backend info in task object for agent access."""
if hasattr(task, "__dict__"):
task.backend_url = self.backend_url
task.api_key = self.api_key
task.state_id = state_info.state_id
# Store current task context for agent configuration
self._current_task_context = state_info.metadata
def _cleanup_task_initial_state(self, task: BaseTask) -> bool:
"""Clean up task-specific resources.
Drops ALL tables created during task (both setup and agent-created)
by comparing against baseline.
Args:
task: Task whose initial state should be cleaned up
Returns:
True if cleanup successful
"""
try:
logger.info(f"| Cleaning up initial state for task: {task.name}")
if self._current_task_context:
schema_name = self._current_task_context.get("schema")
# Get ALL current tables
all_current_tables = self._get_all_tables()
# Find tables to drop: anything not in baseline
tables_to_drop = [
t for t in all_current_tables
if (t['schema'], t['name']) not in self._baseline_tables
]
logger.info(f"| Found {len(tables_to_drop)} tables to clean up (setup + agent-created)")
# Drop individual tables
for table_info in tables_to_drop:
try:
self._drop_table(table_info["schema"], table_info["name"])
logger.debug(f"| ✓ Dropped table: {table_info['schema']}.{table_info['name']}")
except Exception as e:
logger.warning(f"| Failed to drop table {table_info}: {e}")
# Drop the task schema (may be empty if all tables were in public)
if schema_name:
try:
self._drop_schema(schema_name)
logger.info(f"| ✓ Dropped schema: {schema_name}")
except Exception as e:
logger.warning(f"| Failed to drop schema {schema_name}: {e}")
# Clear task context
if self._current_task_context.get("task_name") == task.name:
self._current_task_context = None
logger.info(f"| ✓ Initial state cleanup completed for {task.name}")
return True
except Exception as e:
logger.error(f"Failed to cleanup task initial state for {task.name}: {e}")
return False
def _cleanup_single_resource(self, resource: Dict[str, Any]) -> bool:
"""Clean up a single tracked resource.
This is a placeholder for resource-specific cleanup logic.
Tasks should handle their own cleanup via cleanup scripts.
Args:
resource: Resource dictionary with type, id, and metadata
Returns:
True if cleanup successful
"""
resource_type = resource["type"]
resource_id = resource["id"]
logger.debug(f"| Cleanup for {resource_type} {resource_id} (handled by task scripts)")
return True
def _run_prepare_environment(self, task: BaseTask) -> bool:
"""Run prepare_environment.py script if it exists in the task directory.
The script should use Insforge MCP tools or HTTP API to set up required state.
Args:
task: Task for which to prepare environment
Returns:
True if script ran successfully, False if script doesn't exist
"""
task_dir = task.task_instruction_path.parent
prepare_script = task_dir / "prepare_environment.py"
if not prepare_script.exists():
logger.debug(f"No prepare_environment.py found for task {task.name}")
return False
logger.info(f"| Running prepare_environment.py for task {task.name}")
# Set up environment variables for the script
env = os.environ.copy()
env.update({
"INSFORGE_BACKEND_URL": self.backend_url,
"INSFORGE_API_KEY": self.api_key,
})
try:
# Run the prepare_environment.py script
result = subprocess.run(
[sys.executable, str(prepare_script)],
cwd=str(task_dir), # Run from task directory
env=env,
capture_output=True,
text=True,
timeout=300, # 5 minute timeout
)
if result.returncode == 0:
logger.info(f"| ✓ Environment preparation completed for {task.name}")
if result.stdout.strip():
logger.debug(f"| prepare_environment.py output: {result.stdout}")
return True
else:
logger.error(f"| ✗ Environment preparation failed for {task.name}")
logger.error(f"| Error output: {result.stderr}")
raise RuntimeError(f"prepare_environment.py failed with exit code {result.returncode}")
except subprocess.TimeoutExpired:
logger.error(f"✗ Environment preparation timed out for {task.name}")
raise RuntimeError("prepare_environment.py execution timed out")
except Exception as e:
logger.error(f"✗ Failed to run prepare_environment.py for {task.name}: {e}")
raise
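# Illustrative sketch only (not part of this manager): a task's
# prepare_environment.py receives INSFORGE_BACKEND_URL and INSFORGE_API_KEY
# through the environment assembled above. A minimal script might simply
# confirm the backend is reachable before seeding data; the actual seeding
# calls depend on the task and the Insforge API, so none are assumed here.
#
#     import os
#     import sys
#     import urllib.request
#
#     backend_url = os.environ["INSFORGE_BACKEND_URL"]
#     api_key = os.environ["INSFORGE_API_KEY"]  # used by the real seeding calls
#     try:
#         urllib.request.urlopen(backend_url, timeout=10)
#     except Exception as exc:
#         print(f"Backend unreachable: {exc}", file=sys.stderr)
#         sys.exit(1)
#     sys.exit(0)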
def _get_timestamp(self) -> str:
"""Get timestamp for unique naming."""
from datetime import datetime
return datetime.now().strftime("%Y%m%d%H%M%S")
def _drop_schema(self, schema_name: str) -> None:
"""Drop schema and all its contents."""
import psycopg2
from psycopg2 import sql
conn_params = {
"host": "localhost",
"port": 5432,
"user": "postgres",
"password": "postgres",
"database": "insforge",
}
conn = psycopg2.connect(**conn_params)
conn.autocommit = True
try:
with conn.cursor() as cur:
cur.execute(
sql.SQL("DROP SCHEMA IF EXISTS {} CASCADE").format(
sql.Identifier(schema_name)
)
)
logger.debug(f"| Dropped schema: {schema_name}")
finally:
conn.close()
def _create_schema(self, schema_name: str) -> None:
"""Create empty schema."""
import psycopg2
from psycopg2 import sql
conn_params = {
"host": "localhost",
"port": 5432,
"user": "postgres",
"password": "postgres",
"database": "insforge",
}
conn = psycopg2.connect(**conn_params)
conn.autocommit = True
try:
with conn.cursor() as cur:
cur.execute(
sql.SQL("CREATE SCHEMA {}").format(sql.Identifier(schema_name))
)
logger.debug(f"| Created schema: {schema_name}")
finally:
conn.close()
def _get_all_tables(self) -> List[Dict[str, str]]:
"""Get list of all user tables.
Returns:
List of dicts with 'schema' and 'name' keys
"""
import psycopg2
conn_params = {
"host": "localhost",
"port": 5432,
"user": "postgres",
"password": "postgres",
"database": "insforge",
}
conn = psycopg2.connect(**conn_params)
try:
with conn.cursor() as cur:
cur.execute("""
SELECT table_schema, table_name
FROM information_schema.tables
WHERE table_type = 'BASE TABLE'
AND table_schema NOT IN ('information_schema', 'pg_catalog')
AND table_schema NOT LIKE 'pg_%'
AND table_name NOT LIKE '\\_%'
ORDER BY table_schema, table_name
""")
rows = cur.fetchall()
return [{"schema": row[0], "name": row[1]} for row in rows]
finally:
conn.close()
def _drop_table(self, schema_name: str, table_name: str) -> None:
"""Drop a specific table or materialized view."""
import psycopg2
from psycopg2 import sql
conn_params = {
"host": "localhost",
"port": 5432,
"user": "postgres",
"password": "postgres",
"database": "insforge",
}
conn = psycopg2.connect(**conn_params)
conn.autocommit = True
try:
with conn.cursor() as cur:
# Try dropping as table first
cur.execute(
sql.SQL("DROP TABLE IF EXISTS {}.{} CASCADE").format(
sql.Identifier(schema_name),
sql.Identifier(table_name)
)
)
# Also try dropping as materialized view (in case agent created one)
cur.execute(
sql.SQL("DROP MATERIALIZED VIEW IF EXISTS {}.{} CASCADE").format(
sql.Identifier(schema_name),
sql.Identifier(table_name)
)
)
logger.debug(f"| Dropped table/view: {schema_name}.{table_name}")
finally:
conn.close()
def _restore_from_backup(self, category_name: str) -> bool:
"""Restore from backup file.
Tables may be restored into public schema or category-specific schema
depending on how the backup was created.
Args:
category_name: Name of category (e.g., 'employees', 'chinook', 'lego')
Returns:
True if backup was restored, False if no backup exists
"""
# Path to backup file
backup_dir = Path(__file__).parent.parent.parent.parent / "postgres_state"
backup_file = backup_dir / f"{category_name}.backup"
logger.debug(f"| Looking for backup at: {backup_file}")
logger.debug(f"| Backup exists: {backup_file.exists()}")
if not backup_file.exists():
logger.info(f"| ○ No backup file found: {backup_file}")
return False
logger.info(f"| Restoring {category_name} from backup...")
# Set up environment for pg_restore
env = os.environ.copy()
env["PGPASSWORD"] = "postgres"
try:
# Restore backup without schema filter (tables go to whatever schema they're in)
result = subprocess.run(
[
"pg_restore",
"-h", "localhost",
"-p", "5432",
"-U", "postgres",
"-d", "insforge",
"-v",
str(backup_file),
],
env=env,
capture_output=True,
text=True,
timeout=120, # 2 minute timeout
)
if result.returncode != 0 and "ERROR" in result.stderr:
logger.warning(f"| pg_restore had errors for {category_name}: {result.stderr}")
return False
logger.info(f"| ✓ {category_name} restored successfully")
return True
except subprocess.TimeoutExpired:
logger.error(f"| ✗ Restore timed out for {category_name}")
return False
except Exception as e:
logger.error(f"| ✗ Failed to restore {category_name}: {e}")
return False
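# Illustrative sketch only: pg_restore expects a custom-format archive, so a
# matching backup for a category (e.g. 'employees') could be produced along
# these lines. The "-n employees" schema filter is an assumption about how the
# category data is laid out, not something this manager requires.
#
#     import os
#     import subprocess
#
#     env = dict(os.environ, PGPASSWORD="postgres")
#     subprocess.run(
#         ["pg_dump", "-h", "localhost", "-p", "5432", "-U", "postgres",
#          "-d", "insforge", "-n", "employees", "-Fc",
#          "-f", "postgres_state/employees.backup"],
#         env=env, check=True,
#     )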
def get_service_config_for_agent(self) -> dict:
"""Get configuration for agent execution.
This configuration is passed to the agent/MCP server so it can
connect to the Insforge backend.
Returns:
Dictionary containing backend URL and API key
"""
config = {
"backend_url": self.backend_url,
"api_key": self.api_key,
}
# Include current task context if available
if self._current_task_context:
config["task_context"] = self._current_task_context
return config
def set_verification_environment(self, messages_path: str = None) -> None:
"""Set environment variables needed for verification scripts.
Args:
messages_path: Optional path to messages.json file for verification
"""
os.environ["INSFORGE_BACKEND_URL"] = self.backend_url
os.environ["INSFORGE_API_KEY"] = self.api_key
# Set PostgreSQL connection details for direct database verification
# (Insforge exposes its internal postgres database for verification)
os.environ["POSTGRES_HOST"] = "localhost"
os.environ["POSTGRES_PORT"] = "5432"
os.environ["POSTGRES_DATABASE"] = "insforge"
os.environ["POSTGRES_USERNAME"] = "postgres"
os.environ["POSTGRES_PASSWORD"] = "postgres"
if messages_path:
os.environ["MCP_MESSAGES"] = str(messages_path)
logger.debug("Verification environment variables set for Insforge (including direct postgres access)")
================================================
FILE: src/mcp_services/insforge/insforge_task_manager.py
================================================
"""
Insforge Task Manager for MCPMark
===================================
Manages Insforge task discovery, execution, and verification.
"""
import os
import subprocess
import sys
from dataclasses import dataclass
from pathlib import Path
from typing import Any, Dict, List, Optional
from src.base.task_manager import BaseTask, BaseTaskManager
from src.logger import get_logger
logger = get_logger(__name__)
@dataclass
class InsforgeTask(BaseTask):
"""Insforge-specific task with backend information."""
task_name: str = ""
backend_url: Optional[str] = None
api_key: Optional[str] = None
class InsforgeTaskManager(BaseTaskManager):
"""Manages Insforge tasks for MCPMark evaluation."""
def __init__(self, tasks_root: Path = None):
"""Initialize Insforge task manager.
Args:
tasks_root: Path to tasks directory
"""
if tasks_root is None:
tasks_root = Path(__file__).resolve().parents[3] / "tasks"
super().__init__(
tasks_root,
mcp_service="insforge",
task_class=InsforgeTask,
task_organization="file", # Insforge uses file-based tasks
)
def _create_task_from_files(
self, category_id: str, task_files_info: Dict[str, Any]
) -> Optional[InsforgeTask]:
"""Instantiate an `InsforgeTask` from the dictionary returned by `_find_task_files`."""
import json
# Check for meta.json
meta_path = task_files_info["instruction_path"].parent / "meta.json"
final_category_id = category_id
task_id = task_files_info["task_id"]
if meta_path.exists():
try:
with open(meta_path, 'r') as f:
meta_data = json.load(f)
# Use values from meta.json if available
final_category_id = meta_data.get("category_id", category_id)
task_id = meta_data.get("task_id", task_id)
except Exception as e:
logger.warning(f"Failed to load meta.json from {meta_path}: {e}")
return InsforgeTask(
task_instruction_path=task_files_info["instruction_path"],
task_verification_path=task_files_info["verification_path"],
service="insforge",
category_id=final_category_id,
task_id=task_id,
task_name=task_files_info["task_id"],
)
def _get_verification_command(self, task: InsforgeTask) -> List[str]:
"""Get verification command with Insforge backend info."""
cmd = [sys.executable, str(task.task_verification_path)]
return cmd
def run_verification(self, task: BaseTask) -> subprocess.CompletedProcess:
"""Run verification with Insforge environment."""
env = os.environ.copy()
# Pass Insforge connection info to verification script
if hasattr(task, "backend_url") and task.backend_url:
env["INSFORGE_BACKEND_URL"] = task.backend_url
if hasattr(task, "api_key") and task.api_key:
env["INSFORGE_API_KEY"] = task.api_key
return subprocess.run(
self._get_verification_command(task),
capture_output=True,
text=True,
timeout=300,
env=env,
)
def _format_task_instruction(self, base_instruction: str) -> str:
"""Add Insforge-specific instructions."""
return (
base_instruction
+ "\n\nNote: Use Insforge MCP tools to complete this task. The backend connection is already configured."
)
================================================
FILE: src/mcp_services/notion/__init__.py
================================================
"""
Notion-specific modules for MCPMark.
"""
from .notion_task_manager import NotionTaskManager, NotionTask
from .notion_state_manager import NotionStateManager
__all__ = ["NotionTaskManager", "NotionTask", "NotionStateManager"]
================================================
FILE: src/mcp_services/notion/notion_login_helper.py
================================================
"""
Notion Login Helper for MCPMark
=================================
This module provides a utility class and CLI script for logging into Notion
using Playwright. It saves the authenticated session state to a file,
which can be used for subsequent automated tasks.
"""
import argparse
from pathlib import Path
from typing import Optional
from playwright.sync_api import (
BrowserContext,
Page,
TimeoutError as PlaywrightTimeoutError,
sync_playwright,
)
from src.base.login_helper import BaseLoginHelper
from src.logger import get_logger
# Initialize logger
logger = get_logger(__name__)
class NotionLoginHelper(BaseLoginHelper):
"""
Utility helper for logging into Notion using Playwright.
"""
SUPPORTED_BROWSERS = {"chromium", "firefox"}
def __init__(
self,
*,
url: Optional[str] = None,
headless: bool = True,
state_path: Optional[str | Path] = None,
browser: str = "firefox",
) -> None:
"""
Initializes the Notion login helper.
Args:
url: The Notion URL to open after launching the browser.
headless: Whether to run Playwright in headless mode.
state_path: The path to save the authenticated session state.
browser: The browser engine to use ('chromium' or 'firefox').
"""
super().__init__()
if browser not in self.SUPPORTED_BROWSERS:
raise ValueError(
f"Unsupported browser '{browser}'. Supported browsers are: {', '.join(self.SUPPORTED_BROWSERS)}"
)
self.url = url or "https://www.notion.so/login"
self.headless = headless
self.browser_name = browser
self.state_path = (
Path(state_path or Path.cwd() / "notion_state.json").expanduser().resolve()
)
self._browser_context: Optional[BrowserContext] = None
self._playwright = None
self._browser = None
def login(self) -> BrowserContext:
"""
Launches a browser, performs login, and saves the session state.
"""
if self.state_path.exists():
try:
self.state_path.unlink()
except OSError as e:
logger.warning("Unable to remove existing state file: %s", e)
if self._playwright is None:
self._playwright = sync_playwright().start()
browser_type = getattr(self._playwright, self.browser_name)
self._browser = browser_type.launch(headless=self.headless)
context = self._browser.new_context()
page = context.new_page()
logger.info("Navigating to Notion URL: %s", self.url)
page.goto(self.url, wait_until="load")
if self.headless:
self._handle_headless_login(context)
else:
logger.info(
"A browser window has been opened. Please complete the Notion login."
)
logger.info(
"After you see your workspace, return to this terminal and press ."
)
initial_url = page.url
input()
try:
page.wait_for_url(lambda u: u != initial_url, timeout=10_000)
except PlaywrightTimeoutError:
pass # It's okay if the URL doesn't change
try:
page.wait_for_load_state("domcontentloaded", timeout=5_000)
except PlaywrightTimeoutError:
pass
context.storage_state(path=str(self.state_path))
logger.info("✅ Login successful! Session state saved to %s", self.state_path)
self._browser_context = context
return context
def close(self) -> None:
"""Closes the underlying browser and Playwright instance."""
if self._browser_context:
try:
self._browser_context.close()
finally:
self._browser_context = None
if self._browser:
try:
self._browser.close()
finally:
self._browser = None
if self._playwright:
self._playwright.stop()
self._playwright = None
def _handle_headless_login(self, context: BrowserContext) -> None:
"""
Guides the user through the login process in headless mode.
"""
page: Page = context.pages[0]
login_url = "https://www.notion.so/login"
page.goto(login_url, wait_until="domcontentloaded")
email = input("Enter your Notion email address: ").strip()
try:
email_input = page.locator(
'input[placeholder="Enter your email address..."]'
)
email_input.wait_for(state="visible", timeout=120_000)
email_input.fill(email)
email_input.press("Enter")
except PlaywrightTimeoutError:
raise RuntimeError("Timed out waiting for the email input field.")
except Exception:
page.get_by_role("button", name="Continue", exact=True).click()
try:
code_input = page.locator('input[placeholder="Enter code"]')
code_input.wait_for(state="visible", timeout=120_000)
code = input("Enter the verification code from your email: ").strip()
code_input.fill(code)
code_input.press("Enter")
except PlaywrightTimeoutError:
raise RuntimeError("Timed out waiting for the verification code input.")
except Exception:
page.get_by_role("button", name="Continue", exact=True).click()
try:
page.wait_for_url(lambda url: url != login_url, timeout=180_000)
except PlaywrightTimeoutError:
logger.warning("Login redirect timed out, but proceeding to save state.")
if self.url and self.url != login_url:
page.goto(self.url, wait_until="domcontentloaded")
def __enter__(self) -> "NotionLoginHelper":
self.login()
return self
def __exit__(self, exc_type, exc_val, exc_tb):
self.close()
def main():
"""Main entry point for the Notion login CLI script."""
parser = argparse.ArgumentParser(
description="Authenticate to Notion and generate a session state file.",
formatter_class=argparse.ArgumentDefaultsHelpFormatter,
)
parser.add_argument(
"--headless",
action="store_true",
help="Run the login flow in headless mode (prompts for credentials).",
)
parser.add_argument(
"--browser",
default="firefox",
choices=["chromium", "firefox"],
help="The browser engine to use for Playwright.",
)
args = parser.parse_args()
helper = NotionLoginHelper(headless=args.headless, browser=args.browser)
with helper:
logger.info("Login process completed.")
if __name__ == "__main__":
main()
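# Illustrative usage (the module path below assumes this repository layout):
#
#     # Interactive login in a visible Firefox window; saves notion_state.json
#     python -m src.mcp_services.notion.notion_login_helper
#
#     # Headless login driven by email / verification-code prompts
#     python -m src.mcp_services.notion.notion_login_helper --headless --browser chromium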
================================================
FILE: src/mcp_services/notion/notion_state_manager.py
================================================
"""
Notion State Manager for MCPMark
=================================
This module handles the duplication and management of Notion initial-state
pages for consistent task evaluation using Playwright automation.
"""
import time
from pathlib import Path
from typing import Optional, Tuple, Dict, Any, Set
from notion_client import Client
from playwright.sync_api import (
Browser,
BrowserContext,
Page,
Playwright,
TimeoutError as PlaywrightTimeoutError,
sync_playwright,
)
from src.base.state_manager import BaseStateManager, InitialStateInfo
from src.base.task_manager import BaseTask
from src.logger import get_logger
from src.mcp_services.notion.notion_task_manager import NotionTask
import re
# Initialize logger
logger = get_logger(__name__)
# Pattern to match orphan pages with "(n)" suffix, e.g., "Title (1)", "Title (2)"
ORPHAN_PAGE_PATTERN = re.compile(r".+\s+\(\d+\)$")
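# For illustration: "Project Tracker (1)" and "Employees (12)" match this
# pattern, while "Project Tracker" and "Q4 (draft)" do not.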
# Selectors for Notion UI elements
PAGE_MENU_BUTTON_SELECTOR = '[data-testid="more-button"], div.notion-topbar-more-button, [aria-label="More"], button[aria-label="More"]'
DUPLICATE_MENU_ITEM_SELECTOR = 'text="Duplicate"'
DUPLICATE_WITH_CONTENT_SELECTOR = 'text="Duplicate with content"'
MOVE_TO_MENU_ITEM_SELECTOR = 'text="Move to"'
MOVE_TO_SEARCH_INPUT_SELECTOR = (
'input[placeholder*="Move page to"], textarea[placeholder*="Move page to"]'
)
class NotionStateManager(BaseStateManager):
"""
Manages the state of Notion initial states using Playwright and the Notion API.
"""
def __init__(
self,
source_notion_key: str,
eval_notion_key: str,
headless: bool = True,
browser: str = "firefox",
eval_parent_page_title: str = "MCPMark Eval Hub",
source_parent_page_title: str = "MCPMark Source Hub",
):
"""
Initializes the Notion state manager.
Args:
source_notion_key: The Notion API key for source workspace.
eval_notion_key: The Notion API key for evaluation workspace.
headless: Whether to run Playwright in headless mode.
browser: The browser engine to use ('chromium' or 'firefox').
eval_parent_page_title: Parent page title for evaluation workspace.
source_parent_page_title: Title of the source hub page that contains the initial-state templates.
"""
super().__init__(service_name="notion")
supported_browsers = {"chromium", "firefox"}
if browser not in supported_browsers:
raise ValueError(
f"Unsupported browser '{browser}'. Supported browsers are: {', '.join(supported_browsers)}"
)
self.browser_name = browser
# Initialize separate Notion clients with provided keys
if not source_notion_key or not eval_notion_key:
raise ValueError(
"Both source_notion_key and eval_notion_key must be provided to NotionStateManager."
)
self.source_notion_client = Client(auth=source_notion_key)
self.eval_notion_client = Client(auth=eval_notion_key)
self.headless = headless
self.state_file = Path("notion_state.json")
# Parent page under which duplicated pages should be moved for evaluation
self.eval_parent_page_title = eval_parent_page_title
# Source hub page that contains all initial-state templates
self.source_parent_page_title = source_parent_page_title
# Cache resolved parent page IDs to avoid repeated workspace-wide searches
self._eval_parent_page_id: Optional[str] = None
self._source_hub_page_id: Optional[str] = None
# Browser instance management for reuse within session
self._playwright: Optional[Playwright] = None
self._browser: Optional[Browser] = None
self._context: Optional[BrowserContext] = None
# Validate initialization
if not self.source_notion_client or not self.eval_notion_client:
raise ValueError(
"Both source_notion_key and eval_notion_key must be provided and valid"
)
if not self.state_file.exists():
raise FileNotFoundError(
"Authentication state 'notion_state.json' not found. Run the Notion login helper first."
)
logger.info("Notion state manager initialized successfully")
# =========================================================================
# Core Template Methods (Required by BaseStateManager)
# =========================================================================
def _cleanup_eval_hub_orphans(self) -> None:
"""Clean up all pages in MCPMark Eval Hub before creating new task state."""
try:
parent_page_id = self._ensure_eval_parent_page_id()
if not parent_page_id:
logger.debug(
"| ✗ Parent page '%s' not found in eval workspace, skipping cleanup",
self.eval_parent_page_title,
)
return
# Get all child pages and archive them
children = self.eval_notion_client.blocks.children.list(
block_id=parent_page_id
)
orphan_count = 0
for child in children.get("results", []):
if child.get("type") == "child_page":
try:
self.eval_notion_client.pages.update(
page_id=child["id"], archived=True
)
orphan_count += 1
logger.debug("| ✓ Archived orphan page: %s", child["id"])
except Exception as e:
logger.warning(
"| ✗ Failed to archive orphan page %s: %s", child["id"], e
)
if orphan_count > 0:
logger.info(
"| ✓ Cleaned up %d orphan page(s) from MCPMark Eval Hub", orphan_count
)
except Exception as e:
logger.warning("Orphan cleanup failed (non-critical, continuing): %s", e)
# Don't raise exception - allow execution to continue
def _cleanup_source_hub_orphans(self, exclude_page_ids: Optional[Set[str]] = None) -> int:
"""Clean up all orphan pages in source hub matching 'xxx (n)' pattern.
Args:
exclude_page_ids: Page IDs to exclude from cleanup (e.g., pages currently being operated on)
Returns:
Number of pages archived
"""
exclude_page_ids = exclude_page_ids or set()
source_hub_id = self._ensure_source_hub_page_id()
if not source_hub_id:
return 0
orphan_count = 0
next_cursor = None
try:
while True:
kwargs: Dict[str, Any] = {"block_id": source_hub_id}
if next_cursor:
kwargs["start_cursor"] = next_cursor
children = self.source_notion_client.blocks.children.list(**kwargs)
for child in children.get("results", []):
if child.get("type") != "child_page":
continue
child_id = child.get("id")
if child_id in exclude_page_ids:
continue
child_title = (child.get("child_page", {}) or {}).get("title", "").strip()
# Match "xxx (n)" pattern where n is any digit(s)
if ORPHAN_PAGE_PATTERN.match(child_title):
try:
self.source_notion_client.pages.update(
page_id=child_id, archived=True
)
orphan_count += 1
logger.info("| ✓ Archived source hub orphan: %s (%s)", child_title, child_id)
except Exception as e:
logger.warning("| ✗ Failed to archive orphan %s: %s", child_id, e)
if not children.get("has_more"):
break
next_cursor = children.get("next_cursor")
if orphan_count > 0:
logger.info("| ✓ Cleaned up %d orphan page(s) from source hub", orphan_count)
except Exception as e:
logger.warning("Source hub orphan cleanup failed (non-critical, continuing): %s", e)
return orphan_count
def _ensure_eval_parent_page_id(self) -> Optional[str]:
"""Resolve and cache the evaluation hub parent page ID."""
if self._eval_parent_page_id:
return self._eval_parent_page_id
try:
response = self.eval_notion_client.search(
query=self.eval_parent_page_title,
filter={"property": "object", "value": "page"},
)
for result in response.get("results", []):
props = result.get("properties", {})
title_prop = props.get("title", {}).get("title") or props.get(
"Name", {}
).get("title")
if not title_prop:
continue
title = "".join(t.get("plain_text", "") for t in title_prop).strip()
if title == self.eval_parent_page_title:
self._eval_parent_page_id = result.get("id")
break
if not self._eval_parent_page_id:
logger.debug(
"| ✗ Eval parent page '%s' not found via search",
self.eval_parent_page_title,
)
except Exception as e:
logger.error(
"| ✗ Failed to resolve eval parent page '%s': %s",
self.eval_parent_page_title,
e,
)
return self._eval_parent_page_id
def _ensure_source_hub_page_id(self) -> Optional[str]:
"""Resolve and cache the source hub parent page ID used for initial states."""
if self._source_hub_page_id:
return self._source_hub_page_id
try:
hub_search = self.source_notion_client.search(
query=self.source_parent_page_title,
filter={"property": "object", "value": "page"},
)
for result in hub_search.get("results", []):
props = result.get("properties", {})
title_prop = props.get("title", {}).get("title") or props.get(
"Name", {}
).get("title")
current_title = "".join(
t.get("plain_text", "") for t in (title_prop or [])
).strip()
if current_title == self.source_parent_page_title:
self._source_hub_page_id = result.get("id")
break
if not self._source_hub_page_id:
logger.error(
"| ✗ Source hub page '%s' not found.",
self.source_parent_page_title,
)
except Exception as e:
logger.error(
"| ✗ Failed to resolve source hub page '%s': %s",
self.source_parent_page_title,
e,
)
return self._source_hub_page_id
def _wait_for_database_ready(
self,
page_id: str,
max_retries: int = 10,
retry_delay: int = 2
) -> bool:
"""
Wait for the database backend to be ready by checking page accessibility.
Args:
page_id: The ID of the page to check
max_retries: Maximum number of retry attempts
retry_delay: Delay between retries in seconds
Returns:
True if the database is ready, False if timeout
"""
logger.info("| ○ Starting heartbeat detection for page %s", page_id)
for attempt in range(max_retries):
try:
# Try to retrieve the page from the evaluation workspace
result = self.eval_notion_client.pages.retrieve(page_id=page_id)
# Check if we got a valid response
if result and isinstance(result, dict):
# Additional check: try to get page properties
if "properties" in result:
logger.info(
"| ✓ Database backend is ready (attempt %d/%d)",
attempt + 1,
max_retries
)
return True
except Exception as e:
logger.debug(
"| ✗ Database not ready yet (attempt %d/%d): %s",
attempt + 1,
max_retries,
str(e)
)
# Wait before next retry
if attempt < max_retries - 1:
time.sleep(retry_delay)
logger.error(
"| ✗ Database backend failed to become ready after %d attempts",
max_retries
)
return False
def _create_initial_state(self, task: BaseTask) -> Optional[InitialStateInfo]:
"""Create initial state by duplicating Notion page."""
if not isinstance(task, NotionTask):
logger.error("Task must be NotionTask for Notion state manager")
return None
# Clean up any orphan pages in eval hub before creating new state
self._cleanup_eval_hub_orphans()
# Clean up orphan pages in source hub before duplication
self._cleanup_source_hub_orphans()
try:
initial_state_title = self._category_to_initial_state_title(task.category_id)
initial_state_info = self._find_initial_state_by_title(initial_state_title)
if not initial_state_info:
logger.error(
"| ✗ Initial state not found for category '%s' (title: '%s')",
task.category_id,
initial_state_title,
)
return None
_, initial_state_url = initial_state_info
duplicated_url, duplicated_id = self._duplicate_initial_state_for_task(
initial_state_url, task.category_id, task.name
)
# Wait for database backend to be ready
logger.info("| ○ Checking database backend accessibility for duplicated page...")
if not self._wait_for_database_ready(duplicated_id):
logger.error(
"| ✗ Database backend is not accessible after duplication for task %s",
task.name
)
# Clean up the duplicated page if database is not ready
try:
self.eval_notion_client.pages.update(
page_id=duplicated_id, archived=True
)
logger.info("| ✓ Cleaned up inaccessible duplicated page: %s", duplicated_id)
except Exception as cleanup_error:
logger.error("| ✗ Failed to clean up duplicated page: %s", cleanup_error)
raise RuntimeError(
f"| ✗ Database backend failed to become ready for duplicated page {duplicated_id}"
)
time.sleep(5) # allow the page to fully load
return InitialStateInfo(
state_id=duplicated_id,
state_url=duplicated_url,
metadata={
"original_url": initial_state_url,
"category": task.category_id,
"task_name": task.name,
},
)
except Exception as e:
logger.error(f"| ✗ Failed to create initial state for {task.name}: {e}")
return None
def _store_initial_state_info(
self, task: BaseTask, state_info: InitialStateInfo
) -> None:
"""Store initial state information in NotionTask object."""
if isinstance(task, NotionTask):
task.duplicated_initial_state_id = state_info.state_id
task.duplicated_initial_state_url = state_info.state_url
task.original_initial_state_url = state_info.metadata.get("original_url")
# Track the duplicated page for cleanup
self.track_resource("page", state_info.state_id, state_info.metadata)
def _cleanup_task_initial_state(self, task: BaseTask) -> bool:
"""Clean up initial state for a specific Notion task."""
if not isinstance(task, NotionTask):
return True # Nothing to clean up for non-Notion tasks
initial_state_id = task.duplicated_initial_state_id
if not initial_state_id:
logger.warning(
"| ✗ No duplicated initial state ID found for task %s, skipping cleanup.",
task.name,
)
return False
try:
# Archive the duplicated page
self.eval_notion_client.pages.update(
page_id=initial_state_id, archived=True
)
logger.info("| ✓ Archived page initial state: %s", initial_state_id)
# Remove from tracked resources to avoid duplicate cleanup
self.tracked_resources = [
r
for r in self.tracked_resources
if not (r["type"] == "page" and r["id"] == initial_state_id)
]
return True
except Exception as e:
logger.error("| ✗ Failed to archive initial state %s: %s", initial_state_id, e)
return False
def _cleanup_single_resource(self, resource: Dict[str, Any]) -> bool:
"""Clean up a single Notion resource."""
if resource["type"] == "page":
try:
self.eval_notion_client.pages.update(
page_id=resource["id"], archived=True
)
logger.info(f"| ✓ Archived Notion page: {resource['id']}")
return True
except Exception as e:
logger.error(f"| ✗ Failed to archive Notion page {resource['id']}: {e}")
return False
logger.warning(f"| ? Unknown resource type for cleanup: {resource['type']}")
return False
# =========================================================================
# Notion API Operations
# =========================================================================
def _rename_initial_state_via_api(
self, initial_state_id: str, new_title: str
) -> None:
"""Renames a Notion page using the API."""
try:
self.eval_notion_client.pages.update(
page_id=initial_state_id,
properties={"title": {"title": [{"text": {"content": new_title}}]}},
)
except Exception as e:
logger.error("| ✗ Failed to rename page via API: %s", e)
# ------------------------------------------------------------------
# Playwright helpers
# ------------------------------------------------------------------
def _ensure_browser(self) -> Tuple[Browser, BrowserContext]:
"""Ensure browser instance is available, reusing existing or creating new.
Returns:
Tuple of (Browser, BrowserContext)
"""
if self._playwright is None:
self._playwright = sync_playwright().start()
if self._browser is None:
browser_type = getattr(self._playwright, self.browser_name)
self._browser = browser_type.launch(headless=self.headless)
if self._context is None:
self._context = self._browser.new_context(
storage_state=str(self.state_file),
locale="en-US",
)
return self._browser, self._context
def close(self) -> None:
"""Clean up browser resources. Should be called when session ends."""
if self._context:
try:
# Save storage state before closing
self._context.storage_state(path=str(self.state_file))
self._context.close()
except Exception:
pass
self._context = None
if self._browser:
try:
self._browser.close()
except Exception:
pass
self._browser = None
if self._playwright:
try:
self._playwright.stop()
except Exception:
pass
self._playwright = None
def _recover_duplicate_via_ui(
self,
page: Page,
original_title: str,
*,
timeout: int = 30_000,
) -> Optional[str]:
"""Recover duplicate page URL by navigating via UI when API-based recovery fails.
This method navigates to the source hub and locates the duplicate page
(e.g., "Title (1)") in the Notion sidebar, then clicks on it to obtain
the URL directly from the browser.
Args:
page: The Playwright page instance
original_title: The original page title (without suffix)
timeout: Timeout for UI operations in milliseconds
Returns:
The URL of the duplicate page if found, None otherwise
"""
try:
source_hub_id = self._ensure_source_hub_page_id()
if not source_hub_id:
logger.warning("| ✗ Cannot resolve source hub for UI-based recovery")
return None
# Build URL to navigate to source hub
# Format: https://www.notion.so/<page-id-without-dashes>
clean_hub_id = source_hub_id.replace("-", "")
source_hub_url = f"https://www.notion.so/{clean_hub_id}"
logger.info("| ○ Navigating to source hub for UI-based recovery...")
page.goto(source_hub_url, wait_until="domcontentloaded", timeout=60_000)
time.sleep(3) # Allow page to settle
# Look for page title with "(n)" suffix pattern in sidebar or page content
# The duplicate will be named "Original Title (1)" or similar
duplicate_pattern = re.compile(rf"^{re.escape(original_title)}\s*\(\d+\)$")
# Try to find the duplicate page in the page list/sidebar
# Notion uses different selectors for page links, try common patterns
page_link_selectors = [
f'a:has-text("{original_title} (1)")',
f'div[data-block-id]:has-text("{original_title} (1)")',
f'[role="treeitem"]:has-text("{original_title} (1)")',
]
for selector in page_link_selectors:
try:
locator = page.locator(selector).first
if locator.is_visible(timeout=5000):
logger.info("| ○ Found duplicate page in UI, clicking...")
locator.click()
page.wait_for_load_state("domcontentloaded", timeout=timeout)
time.sleep(3)
recovered_url = page.url
logger.info("| ✓ Recovered duplicate URL via UI: %s", recovered_url)
return recovered_url
except Exception:
continue
# If specific selectors didn't work, try a broader search
try:
# Look for any visible text matching the pattern and click it
all_text_elements = page.locator(f'text="{original_title} ("')
count = all_text_elements.count()
if count > 0:
for i in range(count):
element = all_text_elements.nth(i)
text_content = element.text_content() or ""
if duplicate_pattern.match(text_content.strip()):
logger.info("| ○ Found duplicate via text search, clicking...")
element.click()
page.wait_for_load_state("domcontentloaded", timeout=timeout)
time.sleep(3)
recovered_url = page.url
logger.info("| ✓ Recovered duplicate URL via UI text search: %s", recovered_url)
return recovered_url
except Exception as e:
logger.debug("| ✗ Broad text search failed: %s", e)
logger.warning("| ✗ Could not locate duplicate '%s (n)' in UI", original_title)
return None
except Exception as e:
logger.warning("| ✗ UI-based recovery failed: %s", e)
return None
# =========================================================================
# Playwright Automation Methods
# =========================================================================
def _move_current_page_to_env(
self, page: Page, *, wait_timeout: int = 60_000
) -> None:
"""Moves the currently open page into the designated evaluation parent page.
This operation is done via Playwright UI automation because the Notion API
does not yet expose a direct "move" endpoint for pages. It relies on the
following sequence:
1. Open the page action menu (same selector as duplication).
2. Choose the "Move to" menu item.
3. In the search field that appears (placeholder starts with
"Move page to"), type the target parent page title.
4. Click the matching search result to complete the move.
"""
logger.info(
"| ○ Moving duplicated page to evaluation parent '%s'...",
self.eval_parent_page_title,
)
try:
# Step 1: Open the page menu
page.wait_for_selector(
PAGE_MENU_BUTTON_SELECTOR, state="visible", timeout=30_000
)
page.click(PAGE_MENU_BUTTON_SELECTOR)
# Step 2: Select "Move to"
page.hover(MOVE_TO_MENU_ITEM_SELECTOR)
page.click(MOVE_TO_MENU_ITEM_SELECTOR)
# Step 3: Fill the destination title
page.wait_for_selector(
MOVE_TO_SEARCH_INPUT_SELECTOR, state="visible", timeout=15_000
)
# Ensure focus then type the destination title – using type() triggers
# key events Notion relies on for search filtering.
search_input = page.locator(MOVE_TO_SEARCH_INPUT_SELECTOR).first
search_input.click()
search_input.fill("") # Clear any residual text (safety)
search_input.type(self.eval_parent_page_title, delay=50)
# Step 4: Wait for the search result matching the page title, then click it
# Selector for the menu item row; ensure we click the outer container, not a nested element
result_selector = (
f'div[role="menuitem"]:has-text("{self.eval_parent_page_title}")'
)
page.wait_for_selector(
result_selector, state="visible", timeout=wait_timeout
)
page.locator(result_selector).first.click(force=True)
# Wait for the dialog to disappear – indicates move finished
page.wait_for_selector(
MOVE_TO_SEARCH_INPUT_SELECTOR, state="detached", timeout=wait_timeout
)
# Give Notion a brief moment to process the move
time.sleep(3)
except PlaywrightTimeoutError as e:
logger.error(
"| ✗ Playwright timed out while moving page to evaluation parent – move may have failed."
)
raise RuntimeError("Playwright timeout during move-to operation") from e
except Exception as exc:
logger.error("| ✗ Unexpected error during move-to operation: %s", exc)
# Propagate the error to allow retry logic at higher level if necessary
raise
def _category_to_initial_state_title(self, category: str) -> str:
"""Converts a category name to a capitalized initial state title."""
return " ".join(word.capitalize() for word in category.split("_"))
def _extract_initial_state_id_from_url(self, url: str) -> str:
"""Extracts the initial state ID from a Notion URL."""
slug = url.split("?")[0].split("#")[0].rstrip("/").split("/")[-1]
compact = "".join(c for c in slug if c.isalnum())
if len(compact) < 32:
raise ValueError(f"Could not parse initial state ID from URL: {url}")
compact = compact[-32:]
return f"{compact[:8]}-{compact[8:12]}-{compact[12:16]}-{compact[16:20]}-{compact[20:]}"
# =========================================================================
# URL and State Utilities
# =========================================================================
def _get_slug_base(self, url: str) -> str:
"""Returns the slug part without its trailing 32-char ID (hyphen separated)."""
slug = url.split("?", 1)[0].split("#", 1)[0].rstrip("/").split("/")[-1]
match = re.match(r"^(.*)-([0-9a-fA-F]{32})$", slug)
if match:
return match.group(1)
return slug
def _is_valid_duplicate_url(self, original_url: str, duplicated_url: str) -> bool:
"""Checks whether duplicated_url looks like a Notion duplicate (original slug + '-N')."""
orig_base = self._get_slug_base(original_url)
dup_base = self._get_slug_base(duplicated_url)
if not dup_base.startswith(orig_base + "-"):
return False
suffix = dup_base[len(orig_base) + 1 :]
return suffix.isdigit()
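# For illustration (hypothetical slugs): with an original slug base of
# "My-Page", a duplicate slug base of "My-Page-1" passes this check
# (suffix "1" is numeric), while "Other-Page-1" does not.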
def _find_initial_state_by_title(self, title: str) -> Optional[Tuple[str, str]]:
"""Find a child page under the source hub by exact title.
Strategy:
- Locate the source hub page ("MCPBench Source Hub") via search to get its ID.
- List its first-level children via `blocks.children.list`.
- Find a `child_page` whose title exactly matches `title`.
- Return the page ID and URL (retrieved via `pages.retrieve`).
"""
try:
# 1) Resolve the source hub page once and reuse its ID
source_hub_id = self._ensure_source_hub_page_id()
if not source_hub_id:
return None
# 2) List first-level children of the hub page and find exact title match
matched_child_id: Optional[str] = None
next_cursor = None
while True:
kwargs = {"block_id": source_hub_id}
if next_cursor:
kwargs["start_cursor"] = next_cursor
children = self.source_notion_client.blocks.children.list(**kwargs)
for child in children.get("results", []):
if child.get("type") != "child_page":
continue # Only consider child pages
child_title = (child.get("child_page", {}) or {}).get("title", "").strip()
if child_title == title:
matched_child_id = child.get("id")
break
if matched_child_id or not children.get("has_more"):
break
next_cursor = children.get("next_cursor")
if not matched_child_id:
logger.debug("| ✗ No child page titled '%s' under '%s'", title, self.source_parent_page_title)
return None
# 3) Retrieve the page to get its canonical URL
try:
page_obj = self.source_notion_client.pages.retrieve(page_id=matched_child_id)
page_url = page_obj.get("url")
except Exception as e:
logger.warning("| ✗ Failed to retrieve page URL for '%s' (%s): %s", title, matched_child_id, e)
page_url = None
if not page_url:
# Fall back to returning just the ID if URL couldn't be retrieved
logger.debug("| ○ Returning page ID without URL for '%s'", title)
return matched_child_id, ""
return matched_child_id, page_url
except Exception as e:
logger.error("| ✗ Error locating initial state '%s' via children listing: %s", title, e)
return None
# =========================================================================
# Duplication and State Management
# =========================================================================
# NOTE: Initial state type detection logic has been removed because all initial states are pages.
def _duplicate_current_initial_state(
self,
page: Page,
new_title: Optional[str] = None,
*,
original_initial_state_id: str,
original_initial_state_title: str,
wait_timeout: int = 180_000,
) -> str:
"""Duplicates the currently open Notion initial state using Playwright."""
try:
logger.info("| ○ Opening page menu...")
page.wait_for_selector(
PAGE_MENU_BUTTON_SELECTOR, state="visible", timeout=30_000
)
page.click(PAGE_MENU_BUTTON_SELECTOR)
logger.info("| ○ Clicking 'Duplicate'...")
page.hover(DUPLICATE_MENU_ITEM_SELECTOR)
page.click(DUPLICATE_MENU_ITEM_SELECTOR)
original_url = page.url
logger.info(
"| ○ Waiting for duplicated initial state to load (up to %.1f s)...",
wait_timeout / 1000,
)
page.wait_for_url(lambda url: url != original_url, timeout=wait_timeout)
# wait for the page to fully load
time.sleep(5)
duplicated_url = page.url
# Validate that the resulting URL is a genuine duplicate of the original template.
if not self._is_valid_duplicate_url(original_url, duplicated_url):
# Sometimes duplication succeeds but UI navigates to parent instead of the new page.
# In that case, try to find the most recently created page named exactly "<original title> (1)".
logger.warning(
"| ✗ Duplicate URL pattern mismatch. Attempting recovery by searching for latest '%s (1)' page...",
original_initial_state_title,
)
target_title = f"{original_initial_state_title} (1)"
try:
# Wait 5 seconds before the first search to allow Notion to index the new page
time.sleep(5)
attempts = 3
source_hub_id = self._ensure_source_hub_page_id()
if not source_hub_id:
logger.error(
"| ✗ Cannot resolve source hub ID while locating '%s' duplicate.",
target_title,
)
else:
for retry_idx in range(attempts):
candidates = []
next_cursor = None
while True:
kwargs: Dict[str, Any] = {"block_id": source_hub_id}
if next_cursor:
kwargs["start_cursor"] = next_cursor
children = self.source_notion_client.blocks.children.list(**kwargs)
for child in children.get("results", []):
if child.get("type") != "child_page":
continue
child_id = child.get("id")
if child_id == original_initial_state_id:
continue
child_title = (
(child.get("child_page", {}) or {})
.get("title", "")
.strip()
)
if child_title != target_title:
continue
created_time = child.get("created_time") or child.get(
"last_edited_time"
)
candidates.append((created_time or "", child_id))
if not children.get("has_more"):
break
next_cursor = children.get("next_cursor")
if candidates:
latest_child_id = max(candidates, key=lambda x: x[0])[1]
fallback_url = None
try:
page_obj = self.source_notion_client.pages.retrieve(
page_id=latest_child_id
)
fallback_url = page_obj.get("url")
except Exception as retrieve_error:
logger.warning(
"| ✗ Failed to resolve URL for duplicate '%s': %s",
latest_child_id,
retrieve_error,
)
if fallback_url:
logger.info(
"| ○ Navigating directly to latest '%s' duplicate via children list...",
target_title,
)
page.goto(fallback_url, wait_until="domcontentloaded", timeout=120_000)
time.sleep(5)
duplicated_url = page.url
break
if retry_idx < attempts - 1:
logger.debug(
"| ○ '%s' not visible yet via children listing. Waiting 5s before retry %d/%d...",
target_title,
retry_idx + 1,
attempts - 1,
)
time.sleep(5)
# Re-validate after attempted recovery
if not self._is_valid_duplicate_url(original_url, duplicated_url):
# API-based recovery failed, try UI-based recovery as last resort
logger.warning(
"| ✗ API-based recovery failed. Trying UI-based recovery..."
)
ui_recovered_url = self._recover_duplicate_via_ui(
page,
original_initial_state_title,
timeout=wait_timeout,
)
if ui_recovered_url and self._is_valid_duplicate_url(original_url, ui_recovered_url):
duplicated_url = ui_recovered_url
logger.info("| ✓ UI-based recovery successful")
else:
logger.error(
"| ✗ Could not locate a valid '%s' duplicate after all recovery attempts.\n| Original: %s\n| Observed: %s",
target_title,
original_url,
duplicated_url,
)
# Attempt to clean up stray duplicate before propagating error.
self._cleanup_orphan_duplicate(
original_initial_state_id, original_initial_state_title
)
raise RuntimeError(
"Duplicate URL pattern mismatch – duplication likely failed"
)
except Exception as search_exc:
logger.error(
"| ✗ Failed during recovery search for '%s': %s",
target_title,
search_exc,
)
# Attempt to clean up stray duplicate before propagating error.
self._cleanup_orphan_duplicate(
original_initial_state_id, original_initial_state_title
)
raise RuntimeError(
"Duplicate URL pattern mismatch – duplication likely failed"
) from search_exc
duplicated_initial_state_id = self._extract_initial_state_id_from_url(
duplicated_url
)
# Always move to evaluation parent
self._move_current_page_to_env(page, wait_timeout=wait_timeout)
# Rename if new title is provided
if new_title:
self._rename_initial_state_via_api(
duplicated_initial_state_id, new_title
)
# verify whether the page is moved to the evaluation parent page
try:
result = self.eval_notion_client.pages.retrieve(
page_id=duplicated_initial_state_id
)
if not result or not isinstance(result, dict):
logger.error(
"| ✗ Playwright move to error: Notion API did not return a valid page dict after move."
)
raise RuntimeError(
"Playwright move to error: Notion API did not return a valid page dict after move."
)
logger.info(
"| ✓ Page moved to '%s' successfully.", self.eval_parent_page_title
)
except Exception as move_exc:
logger.error(f"Playwright move to error: {move_exc}")
raise RuntimeError(
"Playwright move to error: Notion client failed to retrieve page after move."
) from move_exc
return duplicated_initial_state_id
except PlaywrightTimeoutError as e:
logger.error("Playwright timed out while duplicating initial state.")
raise RuntimeError("Playwright timeout during duplication") from e
# =========================================================================
# Cleanup and Maintenance
# =========================================================================
def _cleanup_orphan_duplicate(
self,
original_initial_state_id: str,
initial_state_title: str,
) -> bool:
"""Finds and archives a stray duplicate ("orphan") that matches pattern 'Title (n)'.
Returns True if at least one orphan duplicate was archived.
"""
try:
source_hub_id = self._ensure_source_hub_page_id()
if not source_hub_id:
logger.error(
"| ✗ Cannot resolve source hub while cleaning up duplicates for '%s'",
initial_state_title,
)
return False
# Match any numbered duplicate "Title (n)" where n is any digit(s)
title_regex = re.compile(rf"^{re.escape(initial_state_title)}\s*\(\d+\)$")
archived_any = False
next_cursor = None
while True:
kwargs: Dict[str, Any] = {"block_id": source_hub_id}
if next_cursor:
kwargs["start_cursor"] = next_cursor
children = self.source_notion_client.blocks.children.list(**kwargs)
for child in children.get("results", []):
if child.get("type") != "child_page":
continue
dup_id = child.get("id")
if dup_id == original_initial_state_id:
continue
title_plain = (
(child.get("child_page", {}) or {}).get("title", "")
).strip()
if not title_regex.match(title_plain):
continue # not a numbered duplicate
try:
self.source_notion_client.pages.update(
page_id=dup_id, archived=True
)
logger.info("| ✓ Archived orphan duplicate (%s): %s", "page", dup_id)
archived_any = True
except Exception as exc:
logger.warning("| ✗ Failed to archive orphan page %s: %s", dup_id, exc)
if not children.get("has_more"):
break
next_cursor = children.get("next_cursor")
return archived_any
except Exception as exc:
logger.warning(
"Error while attempting to cleanup orphan duplicate: %s", exc
)
return False
def _duplicate_initial_state_for_task(
self,
initial_state_url: str,
category: str,
task_name: str,
*,
max_retries: int = 2,
initial_wait_ms: int = 180_000,
) -> Tuple[str, str]:
"""Duplicates an initial state for a task, with retries for reliability."""
if not self.state_file.exists():
raise FileNotFoundError(
"Authentication state 'notion_state.json' not found. "
"Run the Notion login helper first."
)
last_exc = None
for attempt in range(max_retries + 1):
wait_timeout = initial_wait_ms * (attempt + 1)
page = None
try:
# Reuse browser instance within session
_, context = self._ensure_browser()
page = context.new_page()
logger.info("| ○ Navigating to initial state for %s...", category)
# Start timing from the moment we begin navigating to the initial state page.
start_time = time.time()
page.goto(initial_state_url, wait_until="domcontentloaded", timeout=120_000)
context.storage_state(path=str(self.state_file))
initial_state_id = self._extract_initial_state_id_from_url(
initial_state_url
)
initial_state_title = self._category_to_initial_state_title(
category
)
duplicated_id = self._duplicate_current_initial_state(
page,
new_title=initial_state_title, # Use original initial state name without (1) suffix
original_initial_state_id=initial_state_id,
original_initial_state_title=initial_state_title,
wait_timeout=wait_timeout,
)
duplicated_url = page.url
# Validate URL pattern again at this higher level (should already be validated inside).
context.storage_state(path=str(self.state_file))
# Log how long the whole duplication (navigate → duplicate) took.
elapsed = time.time() - start_time
logger.info(
"| ✓ Initial state duplicated successfully in %.2f seconds (task: %s).",
elapsed,
task_name,
)
return duplicated_url, duplicated_id
except Exception as e:
# No additional cleanup here; it is handled inside _duplicate_current_initial_state.
last_exc = e
if attempt < max_retries:
logger.warning(
"| ✗ Duplication attempt %d failed: %s. Retrying...",
attempt + 1,
e,
)
time.sleep(120 * attempt + 120)
finally:
# Close the page to prevent accumulation within reused context
if page:
try:
page.close()
except Exception:
pass
raise RuntimeError(
f"Initial state duplication failed for task '{task_name}' after {max_retries + 1} attempts: {last_exc}"
)
def get_service_config_for_agent(self) -> dict:
"""
Get service-specific configuration for agent execution.
Returns:
Dictionary containing configuration needed by the agent/MCP server
"""
from src.config.config_schema import ConfigRegistry
# Get the eval_api_key from config registry
config = ConfigRegistry.get_config("notion").get_all()
service_config = {}
if "eval_api_key" in config:
service_config["notion_key"] = config["eval_api_key"]
return service_config
================================================
FILE: src/mcp_services/notion/notion_task_manager.py
================================================
"""
Notion Task Manager for MCPMark Evaluation Pipeline
====================================================
This module provides utilities for discovering, filtering, and managing
evaluation tasks within the MCPMark project structure for Notion service.
The task manager is responsible for:
- Task discovery and filtering
- Task verification and result processing
- Task-specific logic (NOT LLM execution)
"""
import sys
from dataclasses import dataclass
from pathlib import Path
from typing import Any, Dict, List, Optional
from src.base.task_manager import BaseTask, BaseTaskManager
from src.logger import get_logger
logger = get_logger(__name__)
@dataclass
class NotionTask(BaseTask):
"""Represents a single evaluation task for Notion service."""
# Additional Notion-specific fields
# A human-readable slug for the task directory (e.g. "employee_onboarding")
task_name: str = ""
original_initial_state_url: Optional[str] = None
duplicated_initial_state_url: Optional[str] = None
duplicated_initial_state_id: Optional[str] = None
def __post_init__(self):
# Ensure base class fields are set if not provided
if (
not hasattr(self, "task_instruction_path")
or self.task_instruction_path is None
):
self.task_instruction_path = self.description_path
if (
not hasattr(self, "task_verification_path")
or self.task_verification_path is None
):
self.task_verification_path = self.verify_path
@property
def description_path(self) -> Path:
"""Alias for task_instruction_path."""
return self.task_instruction_path
@property
def verify_path(self) -> Path:
"""Alias for task_verification_path."""
return self.task_verification_path
def get_description(self) -> str:
"""Read and return the task description."""
if self.description_path.exists():
return self.description_path.read_text(encoding="utf-8")
return ""
class NotionTaskManager(BaseTaskManager):
"""Manages task discovery, filtering, and verification for Notion-based MCPMark evaluation."""
def __init__(self, tasks_root: Path = None, task_suite: str = "standard"):
"""Initialize with the tasks directory path.
Args:
tasks_root: Path to the tasks directory
task_suite: Logical task suite (e.g., 'standard', 'easy')
"""
if tasks_root is None:
tasks_root = Path(__file__).resolve().parents[3] / "tasks"
# Call parent constructor
super().__init__(tasks_root, mcp_service="notion", task_suite=task_suite)
# =========================================================================
# Service-specific implementations for template methods
# =========================================================================
# No custom task discovery methods needed; relying entirely on BaseTaskManager defaults.
def _get_service_directory_name(self) -> str:
"""Return the service directory name for Notion."""
return "notion"
def _create_task_from_files(
self, category_id: str, task_files_info: Dict[str, Any]
) -> Optional[NotionTask]:
"""Instantiate a `NotionTask` from the dictionary returned by `_find_task_files`."""
import json
# Check for meta.json
meta_path = task_files_info["instruction_path"].parent / "meta.json"
final_category_id = category_id
task_id = task_files_info["task_id"]
if meta_path.exists():
try:
with open(meta_path, 'r') as f:
meta_data = json.load(f)
# Use values from meta.json if available
final_category_id = meta_data.get("category_id", category_id)
task_id = meta_data.get("task_id", task_id)
except Exception as e:
logger.warning(f"Failed to load meta.json from {meta_path}: {e}")
return NotionTask(
task_instruction_path=task_files_info["instruction_path"],
task_verification_path=task_files_info["verification_path"],
service="notion",
category_id=final_category_id,
task_id=task_id,
task_name=task_files_info["task_id"],
)
def _get_verification_command(self, task: NotionTask) -> List[str]:
"""Get the verification command for Notion tasks.
Notion verification requires the duplicated initial state ID.
"""
return [
sys.executable,
str(task.task_verification_path),
task.duplicated_initial_state_id or "",
]
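# Illustrative sketch only: a Notion task's verify.py receives the duplicated
# initial-state page ID as its first CLI argument and presumably reports the
# outcome through its exit code; how it queries Notion (e.g. which API key it
# reads) is task-specific and not assumed here.
#
#     import sys
#
#     page_id = sys.argv[1] if len(sys.argv) > 1 else ""
#     if not page_id:
#         print("No duplicated initial-state ID supplied", file=sys.stderr)
#         sys.exit(1)
#     # ... inspect the duplicated page here ...
#     sys.exit(0)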
================================================
FILE: src/mcp_services/playwright/__init__.py
================================================
#!/usr/bin/env python3
"""
Playwright MCP Service for MCPMark
==================================
This package provides Playwright MCP integration for web automation tasks.
"""
================================================
FILE: src/mcp_services/playwright/playwright_login_helper.py
================================================
"""
Playwright Login Helper for MCPMark
====================================
This module provides browser session management and authentication utilities
for Playwright-based web automation tasks. Handles browser context setup,
session persistence, and state management.
"""
from pathlib import Path
from typing import Optional
from playwright.sync_api import (
BrowserContext,
sync_playwright,
)
from src.base.login_helper import BaseLoginHelper
from src.logger import get_logger
logger = get_logger(__name__)
class PlaywrightLoginHelper(BaseLoginHelper):
"""
Login helper for Playwright web automation tasks.
Manages browser contexts, session persistence, and authentication state
for web automation scenarios.
"""
SUPPORTED_BROWSERS = {"chromium", "firefox"}
def __init__(
self,
*,
browser: str = "chromium",
headless: bool = True,
state_path: Optional[str | Path] = None,
) -> None:
"""
Initialize the Playwright login helper.
Args:
browser: Browser engine to use ('chromium' or 'firefox')
headless: Whether to run browser in headless mode
state_path: Path to save browser session state
"""
super().__init__()
if browser not in self.SUPPORTED_BROWSERS:
raise ValueError(
f"Unsupported browser '{browser}'. Supported: {', '.join(self.SUPPORTED_BROWSERS)}"
)
self.browser_name = browser
self.headless = headless
self.state_path = (
Path(state_path or Path.cwd() / "playwright_state.json")
.expanduser()
.resolve()
)
# Browser management
self._playwright = None
self._browser = None
self._browser_context: Optional[BrowserContext] = None
logger.info(f"Initialized PlaywrightLoginHelper with {browser} browser")
def login(self, **kwargs) -> bool:
"""
Set up browser context and session state.
For most Playwright tasks, this creates a clean browser context
that can be used for web automation. More complex authentication
can be handled in specific implementations.
Returns:
bool: True if browser setup successful
"""
try:
# Clean up any existing browser instances
self.close()
# Start Playwright
self._playwright = sync_playwright().start()
browser_type = getattr(self._playwright, self.browser_name)
self._browser = browser_type.launch(headless=self.headless)
# Create browser context
context_options = {}
# Load existing state if available
if self.state_path.exists():
try:
context_options["storage_state"] = str(self.state_path)
logger.info(f"Loaded browser state from {self.state_path}")
except Exception as e:
logger.warning(f"Failed to load browser state: {e}")
self._browser_context = self._browser.new_context(**context_options)
# Save current state
self._save_browser_state()
logger.info("✅ Browser context setup successful")
return True
except Exception as e:
logger.error(f"Browser setup failed: {e}")
self.close()
return False
def get_browser_context(self) -> Optional[BrowserContext]:
"""
Get the current browser context.
Returns:
BrowserContext or None if not initialized
"""
return self._browser_context
def is_authenticated(self) -> bool:
"""
Check if browser context is ready for use.
Returns:
bool: True if browser context is available
"""
return self._browser_context is not None
def get_credentials(self) -> dict:
"""
Get browser configuration for MCP integration.
Returns:
dict: Browser configuration parameters
"""
return {
"browser": self.browser_name,
"headless": self.headless,
"state_path": str(self.state_path),
}
def _save_browser_state(self) -> None:
"""Save current browser state to file."""
if self._browser_context:
try:
self._browser_context.storage_state(path=str(self.state_path))
logger.debug(f"Browser state saved to {self.state_path}")
except Exception as e:
logger.warning(f"Failed to save browser state: {e}")
def close(self) -> None:
"""Clean up browser resources."""
if self._browser_context:
try:
# Save state before closing
self._save_browser_state()
self._browser_context.close()
except Exception as e:
logger.warning(f"Error closing browser context: {e}")
finally:
self._browser_context = None
if self._browser:
try:
self._browser.close()
except Exception as e:
logger.warning(f"Error closing browser: {e}")
finally:
self._browser = None
if self._playwright:
try:
self._playwright.stop()
except Exception as e:
logger.warning(f"Error stopping Playwright: {e}")
finally:
self._playwright = None
================================================
FILE: src/mcp_services/playwright/playwright_state_manager.py
================================================
"""
Playwright State Manager for MCPMark
======================================
This module manages browser contexts and test environments for Playwright-based
web automation tasks. Handles browser isolation, test page setup, and cleanup.
"""
import time
from pathlib import Path
from typing import Optional, Dict, Any, List
from playwright.sync_api import (
BrowserContext,
Page,
TimeoutError as PlaywrightTimeoutError,
)
from src.base.state_manager import BaseStateManager, InitialStateInfo
from src.base.task_manager import BaseTask
from src.logger import get_logger
logger = get_logger(__name__)
class PlaywrightStateManager(BaseStateManager):
"""
Manages browser state and test environments for Playwright tasks.
Provides browser context isolation, test page setup, and resource cleanup
for web automation evaluation.
"""
def __init__(
self,
browser: str = "chromium",
headless: bool = True,
state_path: Optional[Path] = None,
network_origins: str = "*",
user_profile: str = "isolated",
viewport_width: int = 1280,
viewport_height: int = 720,
):
"""
Initialize Playwright state manager.
Args:
browser: Browser engine to use ('chromium' or 'firefox')
headless: Whether to run browser in headless mode
state_path: Path to browser state file
network_origins: Allowed network origins (comma-separated or *)
user_profile: User profile type (isolated or persistent)
viewport_width: Browser viewport width
viewport_height: Browser viewport height
"""
super().__init__(service_name="playwright")
self.browser_name = browser
self.headless = headless
self.state_path = state_path or Path.cwd() / "playwright_state.json"
self.network_origins = network_origins
self.user_profile = user_profile
self.viewport_width = viewport_width
self.viewport_height = viewport_height
# Browser management
self._playwright = None
self._browser = None
self._current_context: Optional[BrowserContext] = None
# Task-specific tracking
self._current_task_pages: List[Page] = []
# Test environment URLs for different task categories
self.test_environments = {
"element_extraction": "https://mcp-eval-website.vercel.app/extraction",
"form_interaction": "https://mcp-eval-website.vercel.app/forms/",
"web_navigation": "https://mcp-eval-website.vercel.app/navigation",
"authentication": "https://mcp-eval-website.vercel.app/auth/turnstile",
}
logger.info("Playwright state manager initialized")
def _create_initial_state(self, task: BaseTask) -> Optional[InitialStateInfo]:
"""
Create a lightweight stub initial state for task execution.
No browser is launched here; the Playwright MCP client manages the
browser itself. A synthetic context id is tracked so cleanup stays
symmetrical, and the canonical test URL (if any) is exposed.
Args:
task: Task for which to create the initial state
Returns:
InitialStateInfo with the stub context id and test URL
"""
try:
logger.info(
"| Skipping Playwright browser launch – no initial browser state "
"needed for task: %s",
task.name,
)
# Generate a lightweight identifier to allow resource tracking even
# though no real browser context is created.
context_id = f"noop_{task.category_id}_{task.task_id}_{int(time.time())}"
# We still expose the canonical test URL (if any) because some
# consumers add it to the task metadata.
test_url = self.test_environments.get(task.category_id)
# Record a dummy resource so cleanup logic remains symmetrical.
self.track_resource(
"browser_context",
context_id,
{
"task_name": task.name,
"task_category": task.category_id,
"test_url": test_url,
},
)
return InitialStateInfo(
state_id=context_id,
state_url=test_url,
metadata={
"browser": self.browser_name,
"headless": self.headless,
"test_url": test_url,
"task_category": task.category_id,
},
)
except Exception as e:
logger.error(f"Failed to create stub initial state for {task.name}: {e}")
return None
def _store_initial_state_info(
self, task: BaseTask, state_info: InitialStateInfo
) -> None:
"""Store browser context information in task object."""
if hasattr(task, "__dict__"):
task.browser_context_id = state_info.state_id
task.test_url = state_info.state_url
task.browser_config = state_info.metadata
def _cleanup_task_initial_state(self, task: BaseTask) -> bool:
"""Clean up browser context for specific task."""
try:
success = True
# Close any open pages
if self._current_task_pages:
for page in self._current_task_pages:
try:
page.close()
except Exception as e:
logger.warning(f"Failed to close page: {e}")
success = False
self._current_task_pages.clear()
# Close browser context
if self._current_context:
try:
self._current_context.close()
logger.info("Closed browser context")
except Exception as e:
logger.error(f"Failed to close browser context: {e}")
success = False
finally:
self._current_context = None
return success
except Exception as e:
logger.error(f"Error during browser cleanup for {task.name}: {e}")
return False
def _cleanup_single_resource(self, resource: Dict[str, Any]) -> bool:
"""Clean up a single browser resource."""
try:
if resource["type"] == "browser_context":
# Context cleanup is handled in _cleanup_task_initial_state
logger.debug(f"Browser context {resource['id']} marked for cleanup")
return True
logger.warning(f"Unknown resource type for cleanup: {resource['type']}")
return False
except Exception as e:
logger.error(f"Failed to cleanup resource {resource}: {e}")
return False
def _get_context_options(self, task: BaseTask) -> Dict[str, Any]:
"""Get browser context options based on task requirements."""
options = {
"viewport": {"width": self.viewport_width, "height": self.viewport_height}
}
# Load browser state if available
if self.state_path.exists():
try:
options["storage_state"] = str(self.state_path)
except Exception as e:
logger.warning(f"Failed to load browser state: {e}")
# Task-specific context options
if task.category_id == "form_interaction":
# Enable form interactions
options["permissions"] = ["geolocation"]
elif task.category_id == "web_navigation":
# Allow navigation between pages
options["accept_downloads"] = False
return options
def _setup_test_environment(self, task: BaseTask) -> Optional[str]:
"""Set up test environment for task category."""
try:
test_url = self.test_environments.get(task.category_id)
if not test_url:
logger.warning(
f"No test environment defined for category: {task.category_id}"
)
return None
# Create a page and navigate to test environment
if self._current_context:
page = self._current_context.new_page()
# Navigate to test URL to ensure it's accessible
page.goto(test_url, wait_until="networkidle", timeout=30000)
logger.info(f"Test environment ready: {test_url}")
# Track the page for cleanup
self._current_task_pages.append(page)
# Verify page loaded correctly
title = page.title()
if title:
logger.debug(f"Page loaded with title: {title}")
return test_url
except PlaywrightTimeoutError:
logger.error(f"Timeout loading test environment: {test_url}")
except Exception as e:
logger.error(f"Failed to setup test environment: {e}")
return None
def get_current_context(self) -> Optional[BrowserContext]:
"""Get the current browser context."""
return self._current_context
def get_test_page(self) -> Optional[Page]:
"""Get a page for testing (creates new one if needed)."""
if self._current_context:
try:
page = self._current_context.new_page()
self._current_task_pages.append(page)
return page
except Exception as e:
logger.error(f"Failed to create test page: {e}")
return None
def navigate_to_test_url(self, task: BaseTask) -> Optional[Page]:
"""Navigate to the test URL for a specific task."""
test_url = self.test_environments.get(task.category_id)
if not test_url:
logger.error(f"No test URL defined for category: {task.category_id}")
return None
page = self.get_test_page()
if page:
try:
page.goto(test_url, wait_until="networkidle", timeout=30000)
logger.info(f"Navigated to test URL: {test_url}")
return page
except Exception as e:
logger.error(f"Failed to navigate to {test_url}: {e}")
return None
def get_service_config_for_agent(self) -> dict:
"""
Get service-specific configuration for agent execution.
Returns:
Dictionary containing browser configuration for MCP server
"""
config = {
"browser": self.browser_name,
"headless": self.headless,
}
# Add browser state file if it exists
if self.state_path.exists():
config["browser_state"] = str(self.state_path)
# Add test environment URLs
config["test_environments"] = self.test_environments
return config
def close_all(self) -> None:
"""Close all browser resources."""
try:
# Close all pages
for page in self._current_task_pages:
try:
page.close()
except Exception:
pass
self._current_task_pages.clear()
# Close context
if self._current_context:
self._current_context.close()
self._current_context = None
# Close browser
if self._browser:
self._browser.close()
self._browser = None
# Stop Playwright
if self._playwright:
self._playwright.stop()
self._playwright = None
logger.info("All browser resources closed")
except Exception as e:
logger.error(f"Error closing browser resources: {e}")
def set_verification_environment(self, messages_path: str = None) -> None:
"""
Set Playwright-specific environment variables for verification scripts.
Args:
messages_path: Optional path to messages.json file for verification
"""
import os
# Set common MCP_MESSAGES if provided
if messages_path:
os.environ["MCP_MESSAGES"] = str(messages_path)
# Also set PLAYWRIGHT_WORK_DIR to the directory containing messages.json
work_dir = str(Path(messages_path).parent)
os.environ["PLAYWRIGHT_WORK_DIR"] = work_dir
logger.info(f"| Set PLAYWRIGHT_WORK_DIR to: {work_dir}")
logger.info(f"| Set MCP_MESSAGES to: {messages_path}")
def __del__(self):
"""Ensure cleanup on deletion."""
self.close_all()
================================================
FILE: src/mcp_services/playwright/playwright_task_manager.py
================================================
"""
Playwright Task Manager for MCPMark
====================================
Simple task manager for Playwright MCP tasks.
Follows anti-over-engineering principles: keep it simple, do what's needed.
"""
import sys
import os
import subprocess
from pathlib import Path
from typing import List, Dict, Any
from src.base.task_manager import BaseTask, BaseTaskManager
from src.logger import get_logger
logger = get_logger(__name__)
class PlaywrightTask(BaseTask):
"""Playwright-specific task that uses directory name as task name."""
class PlaywrightTaskManager(BaseTaskManager):
"""Simple task manager for Playwright MCP tasks."""
def __init__(self, tasks_root: Path = None, task_suite: str = "standard"):
"""Initialize with tasks directory."""
if tasks_root is None:
tasks_root = Path(__file__).resolve().parents[3] / "tasks"
super().__init__(
tasks_root,
mcp_service="playwright",
task_class=PlaywrightTask,
task_organization="directory",
task_suite=task_suite,
)
def _create_task_from_files(
self, category_id: str, task_files_info: Dict[str, Any]
) -> PlaywrightTask:
"""Instantiate a `PlaywrightTask` from the dictionary returned by `_find_task_files`."""
import json
# Check for meta.json
meta_path = task_files_info["instruction_path"].parent / "meta.json"
final_category_id = category_id
task_id = task_files_info["task_id"]
if meta_path.exists():
try:
with open(meta_path, 'r') as f:
meta_data = json.load(f)
# Use values from meta.json if available
final_category_id = meta_data.get("category_id", category_id)
task_id = meta_data.get("task_id", task_id)
except Exception as e:
logger.warning(f"Failed to load meta.json from {meta_path}: {e}")
return PlaywrightTask(
task_instruction_path=task_files_info["instruction_path"],
task_verification_path=task_files_info["verification_path"],
service="playwright",
category_id=final_category_id,
task_id=task_id,
)
def _get_verification_command(self, task: BaseTask) -> List[str]:
"""Get verification command - just run the verify.py script."""
return [sys.executable, str(task.task_verification_path)]
def run_verification(self, task: BaseTask) -> subprocess.CompletedProcess:
"""Run verification with Playwright-specific environment."""
env = os.environ.copy()
# Pass messages.json path and working directory to verification script
messages_path = os.getenv("MCP_MESSAGES")
work_dir = os.getenv("PLAYWRIGHT_WORK_DIR")
if messages_path:
env["MCP_MESSAGES"] = messages_path
logger.debug(f"Setting MCP_MESSAGES to: {messages_path}")
if work_dir:
env["PLAYWRIGHT_WORK_DIR"] = work_dir
logger.debug(f"Setting PLAYWRIGHT_WORK_DIR to: {work_dir}")
return subprocess.run(
self._get_verification_command(task),
capture_output=True,
text=True,
timeout=90,
env=env,
)
def _format_task_instruction(self, base_instruction: str) -> str:
"""Add Playwright-specific note to instructions."""
return (
base_instruction
+ "\n\nUse Playwright MCP tools to complete this web automation task."
)
================================================
FILE: src/mcp_services/playwright_webarena/playwright_login_helper.py
================================================
"""
WebArena (Docker) Login Helper for MCPMark
==========================================
This helper exposes basic browser configuration for agents. Authentication is
not required for the public WebArena environment; isolation is handled via
Docker containerization in the state manager.
"""
from __future__ import annotations
from pathlib import Path
from typing import Optional
from src.base.login_helper import BaseLoginHelper
from src.logger import get_logger
logger = get_logger(__name__)
class PlaywrightLoginHelper(BaseLoginHelper):
"""
Minimal login helper. It does not launch browsers; that is handled by
the Playwright MCP client. It simply exposes configuration parameters such
as headless mode and an optional storage state file path.
"""
def __init__(
self,
*,
browser: str = "chromium",
headless: bool = True,
state_path: Optional[str | Path] = None,
base_url: Optional[str] = None,
) -> None:
super().__init__()
self.browser_name = browser
self.headless = headless
self.state_path = (
Path(state_path or Path.cwd() / "playwright_state.json")
.expanduser()
.resolve()
)
self.base_url = base_url
logger.info(
"Initialized WebArenaLoginHelper (browser=%s, headless=%s)",
browser,
headless,
)
def login(self, **kwargs) -> bool:
"""
No-op login. For WebArena we don't need credentials; we only provide
configuration for the MCP to open a browser.
"""
logger.info("WebArenaLoginHelper login: no-op")
return True
def is_authenticated(self) -> bool:
return True
def get_credentials(self) -> dict:
return {
"browser": self.browser_name,
"headless": self.headless,
"state_path": str(self.state_path),
"base_url": self.base_url,
}
def close(self) -> None:
# No resources to release
pass
================================================
FILE: src/mcp_services/playwright_webarena/playwright_state_manager.py
================================================
"""
WebArena (Docker) State Manager for MCPMark
===========================================
This module manages a WebArena environment that runs inside a Docker container.
It is responsible for starting the container in the initial state phase and
stopping/removing it during cleanup. It exposes the target URL (e.g.
http://localhost:9999) for Playwright MCP-based automation.
"""
from __future__ import annotations
import socket
import subprocess
import time
from dataclasses import dataclass
from pathlib import Path
from typing import Optional, Dict, Any
from urllib.parse import urlparse
import requests
from src.base.state_manager import BaseStateManager, InitialStateInfo
from src.base.task_manager import BaseTask
from src.logger import get_logger
logger = get_logger(__name__)
@dataclass
class DockerConfig:
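# Defaults target the shopping_admin WebArena container; _create_initial_state
# overrides image/container/port/readiness_path per task category (see CATEGORY_CONFIGS).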
image_name: str = "shopping_admin_final_0719"
image_tar_path: Optional[Path] = None
container_name: str = "shopping_admin"
host_port: int = 7780
container_port: int = 80
readiness_path: str = "/admin"
readiness_timeout_seconds: int = 600
readiness_poll_interval_seconds: float = 2.0
@property
def base_url(self) -> str:
return f"http://localhost:{self.host_port}"
class PlaywrightStateManager(BaseStateManager):
"""
Manage Docker lifecycle for WebArena-backed tasks.
- Initial state: ensure image is present (optionally load from tar), then
run container and wait until HTTP endpoint is ready.
- Cleanup: stop and remove the container.
"""
# Category-specific Docker configurations
CATEGORY_CONFIGS = {
"reddit": {
"image_name": "postmill-populated-exposed-withimg",
"container_name": "forum",
"host_port": 9999,
"readiness_path": "/"
},
"shopping": {
"image_name": "shopping_final_0712",
"container_name": "shopping",
"host_port": 7770,
"readiness_path": "/"
},
"shopping_admin": {
"image_name": "shopping_admin_final_0719",
"container_name": "shopping_admin",
"host_port": 7780,
"readiness_path": "/admin"
}
}
def __init__(
self,
*,
docker_image_name: str = "shopping_admin_final_0719",
docker_container_name: str = "shopping_admin",
host_port: int = 7780,
container_port: int = 80,
image_tar_path: Optional[str | Path] = None,
readiness_path: str = "/admin",
readiness_timeout_seconds: int = 600,
readiness_poll_interval_seconds: float = 2.0,
# Playwright browser config params (ignored by this state manager)
browser: Optional[str] = None,
headless: Optional[bool] = None,
network_origins: Optional[str] = None,
user_profile: Optional[str] = None,
viewport_width: Optional[int] = None,
viewport_height: Optional[int] = None,
# Debug mode - skip container cleanup
skip_cleanup: bool = False,
) -> None:
super().__init__(service_name="playwright_webarena")
self.config = DockerConfig(
image_name=docker_image_name,
image_tar_path=Path(image_tar_path).expanduser().resolve()
if image_tar_path
else None,
container_name=docker_container_name,
host_port=host_port,
container_port=container_port,
readiness_path=readiness_path,
readiness_timeout_seconds=readiness_timeout_seconds,
readiness_poll_interval_seconds=readiness_poll_interval_seconds,
)
self.skip_cleanup = skip_cleanup
logger.info(
"Initialized WebArenaStateManager (image=%s, container=%s, port=%s, skip_cleanup=%s)",
self.config.image_name,
self.config.container_name,
self.config.host_port,
self.skip_cleanup,
)
# ---- Helpers ---------------------------------------------------------
def _run_cmd(
self, args: list[str], *, check: bool = False
) -> subprocess.CompletedProcess:
logger.debug("| Running command: %s", " ".join(args))
return subprocess.run(
args, stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True, check=check
)
def _image_exists(self, image: str) -> bool:
result = self._run_cmd(
["docker", "images", "--format", "{{.Repository}}:{{.Tag}}"]
)
lines = [line.strip() for line in result.stdout.splitlines() if line.strip()]
# Parse target image (allow optional tag; default latest)
if ":" in image:
target_repo, target_tag = image.split(":", 1)
else:
target_repo, target_tag = image, "latest"
for repo_tag in lines:
if ":" in repo_tag:
repo, tag = repo_tag.split(":", 1)
else:
repo, tag = repo_tag, "latest"
if repo == target_repo and tag == target_tag:
logger.debug("| Found Docker image %s:%s", repo, tag)
return True
logger.debug("| Docker image not found: %s:%s", target_repo, target_tag)
return False
def _load_image_from_tar_if_needed(self) -> None:
if self.config.image_tar_path and not self._image_exists(
self.config.image_name
):
logger.info("| Loading Docker image from tar: %s", self.config.image_tar_path)
result = self._run_cmd(
["docker", "load", "--input", str(self.config.image_tar_path)]
)
if result.returncode != 0:
logger.error("| Failed to load Docker image: %s", result.stderr.strip())
raise RuntimeError(f"docker load failed: {result.stderr}")
logger.info("| Docker image loaded")
def _stop_and_remove_container(self, name: str) -> None:
# Stop (ignore errors if not running)
self._run_cmd(["docker", "stop", name])
# Remove (ignore errors if not exists)
self._run_cmd(["docker", "rm", name])
def _container_is_running(self, name: str) -> bool:
result = self._run_cmd(
["docker", "ps", "--filter", f"name=^{name}$", "--format", "{{.Names}}"]
)
running = any(line.strip() == name for line in result.stdout.splitlines())
logger.debug("| Container '%s' running: %s", name, running)
return running
def _port_open(self, host: str, port: int) -> bool:
try:
with socket.create_connection((host, port), timeout=1.0):
return True
except OSError:
return False
def _http_ready(self, url: str) -> bool:
try:
resp = requests.get(url, timeout=3)
return resp.status_code < 500
except Exception:
return False
def _get_entry_url(self) -> str:
base = self.config.base_url.rstrip("/")
path = self.config.readiness_path
if not path or path == "/":
return base
return f"{base}{path}"
def _wait_until_ready(self) -> bool:
deadline = time.time() + self.config.readiness_timeout_seconds
base_url = self.config.base_url.rstrip("/")
url = self._get_entry_url()
# Determine host and port from URL for port checks
parsed = urlparse(base_url)
host = parsed.hostname or "localhost"
port = parsed.port or self.config.host_port
# First wait for port to open to avoid long HTTP errors
while time.time() < deadline:
if self._port_open(host, port):
break
time.sleep(self.config.readiness_poll_interval_seconds)
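# Then poll the HTTP endpoint until it answers with a non-5xx response.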
while time.time() < deadline:
if self._http_ready(url):
logger.info("| WebArena HTTP endpoint ready: %s", url)
return True
time.sleep(self.config.readiness_poll_interval_seconds)
logger.error("| Timed out waiting for WebArena at %s", url)
return False
def _wait_for_mysql_ready(self, max_wait_seconds: int = 120) -> bool:
"""Wait for MySQL to be ready in the container."""
deadline = time.time() + max_wait_seconds
while time.time() < deadline:
result = self._run_cmd([
"docker", "exec", self.config.container_name,
"mysql", "-u", "magentouser", "-pMyPassword",
"magentodb", "-e", "SELECT 1;"
])
if result.returncode == 0:
logger.info("| MySQL is ready in container %s", self.config.container_name)
return True
time.sleep(2)
logger.warning("| MySQL not ready after %d seconds", max_wait_seconds)
return False
def _wait_for_magento_ready(self, max_wait_seconds: int = 180) -> bool:
"""Wait for Magento to be fully initialized."""
deadline = time.time() + max_wait_seconds
while time.time() < deadline:
# Check if Magento's setup is complete by trying to access config
result = self._run_cmd([
"docker", "exec", self.config.container_name,
"/var/www/magento2/bin/magento", "config:show", "web/unsecure/base_url"
])
if result.returncode == 0:
logger.info("| Magento is ready in container %s", self.config.container_name)
return True
time.sleep(5)
logger.warning("| Magento not ready after %d seconds", max_wait_seconds)
return False
def _configure_shopping_post_start(self) -> None:
"""Run Magento-specific steps for shopping container.
Waits for services to be ready before configuring.
"""
logger.info("| Running shopping post-start setup")
# Wait for MySQL to be ready first
if not self._wait_for_mysql_ready():
logger.warning("| MySQL not ready, attempting configuration anyway")
# Wait for Magento to be ready
if not self._wait_for_magento_ready():
logger.warning("| Magento not ready, attempting configuration anyway")
base_url = f"http://localhost:{self.config.host_port}"
cmds = [
[
"docker",
"exec",
self.config.container_name,
"/var/www/magento2/bin/magento",
"setup:store-config:set",
f"--base-url={base_url}",
],
[
"docker",
"exec",
self.config.container_name,
"mysql",
"-u",
"magentouser",
"-pMyPassword",
"magentodb",
"-e",
f"UPDATE core_config_data SET value='{base_url}/' WHERE path IN ('web/secure/base_url', 'web/unsecure/base_url');",
],
[
"docker",
"exec",
self.config.container_name,
"/var/www/magento2/bin/magento",
"cache:flush",
],
]
for cmd in cmds:
result = self._run_cmd(cmd)
if result.returncode != 0:
logger.warning(
"| Shopping setup step failed (%s): %s",
" ".join(cmd),
result.stderr.strip(),
)
else:
logger.debug(
"| Shopping setup step ok (%s): %s",
" ".join(cmd),
result.stdout.strip(),
)
def _configure_shopping_admin_post_start(self) -> None:
"""Run Magento-specific steps for shopping_admin container.
Waits for services to be ready before configuring.
"""
logger.info("| Running shopping_admin post-start setup")
# Wait for MySQL to be ready first
if not self._wait_for_mysql_ready():
logger.warning("| MySQL not ready, attempting configuration anyway")
# Wait for Magento to be ready
if not self._wait_for_magento_ready():
logger.warning("| Magento not ready, attempting configuration anyway")
base_url = f"http://localhost:{self.config.host_port}"
cmds = [
[
"docker",
"exec",
self.config.container_name,
"/var/www/magento2/bin/magento",
"setup:store-config:set",
f"--base-url={base_url}",
],
[
"docker",
"exec",
self.config.container_name,
"mysql",
"-u",
"magentouser",
"-pMyPassword",
"magentodb",
"-e",
f"UPDATE core_config_data SET value='{base_url}/' WHERE path IN ('web/secure/base_url', 'web/unsecure/base_url');",
],
[
"docker",
"exec",
self.config.container_name,
"/var/www/magento2/bin/magento",
"config:set",
"admin/security/password_is_forced",
"0",
],
[
"docker",
"exec",
self.config.container_name,
"/var/www/magento2/bin/magento",
"config:set",
"admin/security/password_lifetime",
"0",
],
[
"docker",
"exec",
self.config.container_name,
"/var/www/magento2/bin/magento",
"cache:flush",
],
]
for cmd in cmds:
result = self._run_cmd(cmd)
if result.returncode != 0:
logger.warning(
"| Shopping_admin setup step failed (%s): %s",
" ".join(cmd),
result.stderr.strip(),
)
else:
logger.debug(
"| Shopping_admin setup step ok (%s): %s",
" ".join(cmd),
result.stdout.strip(),
)
# ---- BaseStateManager hooks -----------------------------------------
def _create_initial_state(self, task: BaseTask) -> Optional[InitialStateInfo]:
try:
# Dynamically update config based on task category
if hasattr(task, 'category_id') and task.category_id in self.CATEGORY_CONFIGS:
category_config = self.CATEGORY_CONFIGS[task.category_id]
logger.info(f"| Using category-specific config for '{task.category_id}': {category_config}")
# Update the config with category-specific values
self.config.image_name = category_config["image_name"]
self.config.container_name = category_config["container_name"]
self.config.host_port = category_config["host_port"]
self.config.readiness_path = category_config["readiness_path"]
# Ensure image exists (load from tar if configured)
self._load_image_from_tar_if_needed()
# Ensure any stale container is gone
self._stop_and_remove_container(self.config.container_name)
# Run container
run_cmd = [
"docker",
"run",
"--name",
self.config.container_name,
"-p",
f"{self.config.host_port}:{self.config.container_port}",
"-d",
self.config.image_name,
]
print("| Docker run command: ", run_cmd)
result = self._run_cmd(run_cmd)
if result.returncode != 0:
logger.error("| Failed to start container: %s", result.stderr.strip())
return None
container_id = result.stdout.strip()
logger.info(
"| Started container %s (%s)", self.config.container_name, container_id
)
# Special handling for shopping and shopping_admin
if self.config.container_name == "shopping":
self._configure_shopping_post_start()
if self.config.container_name == "shopping_admin":
self._configure_shopping_admin_post_start()
# Wait for readiness
if not self._wait_until_ready():
# Cleanup on failure
self._stop_and_remove_container(self.config.container_name)
return None
entry_url = self._get_entry_url()
# Track resource for cleanup
self.track_resource(
"docker_container",
self.config.container_name,
{
"image": self.config.image_name,
"host_port": self.config.host_port,
"container_port": self.config.container_port,
"base_url": entry_url,
},
)
# Provide initial state info
return InitialStateInfo(
state_id=self.config.container_name,
state_url=entry_url,
metadata={
"docker_image": self.config.image_name,
"container_name": self.config.container_name,
"host_port": self.config.host_port,
"container_port": self.config.container_port,
"base_url": entry_url,
"category": task.category_id,
},
)
except Exception as exc:
logger.error("| Failed to create WebArena initial state: %s", exc)
return None
def _store_initial_state_info(
self, task: BaseTask, state_info: InitialStateInfo
) -> None:
if hasattr(task, "__dict__"):
task.docker_container_name = state_info.state_id
task.base_url = state_info.state_url
task.docker_metadata = state_info.metadata
def _cleanup_task_initial_state(self, task: BaseTask) -> bool:
if self.skip_cleanup:
logger.info("| Skipping container cleanup (skip_cleanup=True)")
logger.info("| Container is still running at: %s", self._get_entry_url())
logger.info(
"| To manually stop: docker stop %s && docker rm %s",
self.config.container_name,
self.config.container_name,
)
return True
try:
self._stop_and_remove_container(self.config.container_name)
return True
except Exception as exc:
logger.error("| Failed to cleanup container for %s: %s", task.name, exc)
return False
def _cleanup_single_resource(self, resource: Dict[str, Any]) -> bool:
if self.skip_cleanup:
logger.info(
"| Skipping resource cleanup for %s (skip_cleanup=True)",
resource.get("id"),
)
return True
try:
if resource.get("type") == "docker_container":
self._stop_and_remove_container(resource["id"])
return True
logger.warning(
"| Unknown resource type for cleanup: %s", resource.get("type")
)
return False
except Exception as exc:
logger.error("| Resource cleanup failed: %s", exc)
return False
def get_service_config_for_agent(self) -> dict:
"""
Provide configuration to the agent. The key piece is the base URL that
agents should navigate to when starting tasks.
"""
return {
"environment": "webarena-docker",
"base_url": self._get_entry_url(),
"docker": {
"image": self.config.image_name,
"container": self.config.container_name,
"host_port": self.config.host_port,
"container_port": self.config.container_port,
},
}
def close_all(self) -> None:
if self.skip_cleanup:
logger.info("| Skipping container cleanup in close_all (skip_cleanup=True)")
return
try:
self._stop_and_remove_container(self.config.container_name)
except Exception:
# Best effort
pass
def __del__(self) -> None:
if not self.skip_cleanup:
self.close_all()
================================================
FILE: src/mcp_services/playwright_webarena/playwright_task_manager.py
================================================
"""
WebArena Playwright Task Manager for MCPMark
============================================
Simple task manager for WebArena-backed Playwright MCP tasks.
"""
from __future__ import annotations
import sys
import os
import subprocess
from pathlib import Path
from typing import List, Dict, Any
from src.logger import get_logger
from src.base.task_manager import BaseTask, BaseTaskManager
logger = get_logger(__name__)
class PlaywrightTaskManager(BaseTaskManager):
"""Task manager for Playwright tasks against a WebArena environment."""
def __init__(
self,
tasks_root: Path | None = None,
base_url: str | None = None,
task_suite: str = "standard",
):
if tasks_root is None:
tasks_root = Path(__file__).resolve().parents[3] / "tasks"
super().__init__(
tasks_root,
mcp_service="playwright_webarena",
task_class=BaseTask,
task_organization="directory",
task_suite=task_suite,
)
def _create_task_from_files(
self, category_id: str, task_files_info: Dict[str, Any]
) -> BaseTask:
import json
# Check for meta.json
meta_path = task_files_info["instruction_path"].parent / "meta.json"
final_category_id = category_id
task_id = task_files_info["task_id"]
if meta_path.exists():
try:
with open(meta_path, 'r') as f:
meta_data = json.load(f)
# Use values from meta.json if available
final_category_id = meta_data.get("category_id", category_id)
task_id = meta_data.get("task_id", task_id)
except Exception as e:
logger.warning(f"Failed to load meta.json from {meta_path}: {e}")
task = BaseTask(
task_instruction_path=task_files_info["instruction_path"],
task_verification_path=task_files_info["verification_path"],
service="playwright_webarena",
category_id=final_category_id,
task_id=task_id,
)
return task
# NEW: inject a unified prefix based on the base_url the state manager stored on the task
def get_task_instruction(self, task: BaseTask) -> str:
base_instruction = task.get_task_instruction().strip()
base_url = getattr(task, "base_url", None)
if not base_url:
return self._format_task_instruction(base_instruction)
prefix = f"Navigate to {base_url.rstrip('/')} and complete the following task."
# Prefix + original task instruction
return self._format_task_instruction(f"{prefix}\n\n{base_instruction}")
def _get_verification_command(self, task: BaseTask) -> List[str]:
return [sys.executable, str(task.task_verification_path)]
# Pass base_url to verify.py via an environment variable
def run_verification(self, task: BaseTask) -> subprocess.CompletedProcess:
env = os.environ.copy()
base_url = getattr(task, "base_url", None)
if base_url:
env["WEBARENA_BASE_URL"] = base_url.rstrip("/")
return subprocess.run(
self._get_verification_command(task),
capture_output=True,
text=True,
timeout=300,
env=env,
)
def _format_task_instruction(self, base_instruction: str) -> str:
note = "Use Playwright MCP tools to complete this task."
return (base_instruction
+ "\n\n"
+ note + "\n\nNote: Based on your understanding, solve the task all at once by yourself, don't ask for my opinions on anything.")
================================================
FILE: src/mcp_services/playwright_webarena/reddit_env_setup.md
================================================
# WebArena Reddit Environment Setup Guide
This guide describes how to set up the WebArena Reddit environment for Playwright MCP automation testing.
## System Requirements
- Ubuntu 22.04+ or another Linux distribution
- A working Docker environment
- At least 50 GB of free disk space
- At least 4 GB of RAM
## Quick Setup Steps
### 1. Download the Reddit Docker Image
WebArena provides three image mirrors; pick whichever is fastest for your network:
```bash
# Option 1: Google Drive (usually the fastest)
pip install gdown
gdown 17Qpp1iu_mPqzgO_73Z9BnFjHrzmX9DGf
# Option 2: Archive.org
wget https://archive.org/download/webarena-env-forum-image/postmill-populated-exposed-withimg.tar
# Option 3: CMU server
wget http://metis.lti.cs.cmu.edu/webarena-images/postmill-populated-exposed-withimg.tar
```
### 2. Install Docker (if not already installed)
```bash
sudo apt update
sudo apt install docker.io -y
sudo systemctl start docker
sudo systemctl enable docker
sudo usermod -aG docker $USER
newgrp docker
```
### 3. Start the Reddit Environment
```bash
# Load the Docker image (about 50 GB; this takes a few minutes)
docker load --input postmill-populated-exposed-withimg.tar
# Start the container
docker run --name forum -p 9999:80 -d postmill-populated-exposed-withimg
# Wait for the services to start (about 1-2 minutes)
sleep 120
# Check the service status
docker logs forum | tail -10
curl -I http://localhost:9999
```
### 4. Verify the Environment
Visiting `http://localhost:9999` should show the Postmill forum homepage (a quick programmatic check is sketched after this list), including:
- Navigation bar (Forums, Wiki)
- Search box
- Log in / Sign up links
- Forum list (AskReddit, technology, gaming, etc.)
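As a quick sanity check, the snippet below is a minimal sketch using the `requests` library. It assumes the container from step 3 is published on port 9999 and that the homepage markup mentions "Postmill"; adjust the URL and the marker string for your setup.
```python
import requests

# Assumes the forum container from step 3 is published on localhost:9999.
BASE_URL = "http://localhost:9999"

resp = requests.get(BASE_URL, timeout=10)
print("HTTP status:", resp.status_code)

# The homepage should load and (typically) mention Postmill in its markup.
if resp.ok and "Postmill" in resp.text:
    print("Reddit environment looks ready")
else:
    print("Environment not ready yet - check `docker logs forum`")
```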
## Port Exposure Strategies
Choose the port exposure strategy that fits your use case:
### Strategy 1: GCP Firewall Rule (recommended for production)
**Best for**: long-term use, team collaboration, stable public access
```bash
# Install the gcloud CLI (if not already installed)
curl https://sdk.cloud.google.com | bash
exec -l $SHELL
# Authenticate
gcloud auth login
# Create the firewall rule
gcloud compute firewall-rules create allow-reddit-9999 \
--allow tcp:9999 \
--source-ranges 0.0.0.0/0 \
--description "Allow access to WebArena Reddit on port 9999"
# Get the external IP
gcloud compute instances list
```
**Pros**: permanent, stable, no extra dependencies
**Cons**: requires GCP permissions; fully open to the public internet
### Strategy 2: ngrok Tunnel (quick sharing)
**Best for**: temporary demos, quick tests, no GCP permissions required
```bash
# Install ngrok
wget https://bin.equinox.io/c/bNyj1mQVY4c/ngrok-v3-stable-linux-amd64.tgz
tar xvzf ngrok-v3-stable-linux-amd64.tgz
sudo mv ngrok /usr/local/bin
# Create the tunnel
ngrok http 9999
```
**Pros**: works instantly, HTTPS support, no server configuration needed
**Cons**: temporary URL, must stay running, free tier has limits
### Strategy 3: Cloudflared Tunnel (free and persistent)
**Best for**: long-term free use, no GCP required, stable access needed
```bash
# Install cloudflared
wget https://github.com/cloudflare/cloudflared/releases/latest/download/cloudflared-linux-amd64
sudo mv cloudflared-linux-amd64 /usr/local/bin/cloudflared
sudo chmod +x /usr/local/bin/cloudflared
# Create a temporary tunnel
cloudflared tunnel --url http://localhost:9999
# Or create a permanent tunnel (requires a Cloudflare account)
cloudflared tunnel login
cloudflared tunnel create webarena-reddit
cloudflared tunnel route dns webarena-reddit reddit.yourdomain.com
```
**Pros**: free, persistent, custom domain support
**Cons**: requires a Cloudflare account; setup is slightly more involved
### Strategy 4: SSH Port Forwarding (development and debugging)
**Best for**: local development, high security requirements, team-internal access
```bash
# Run on your local machine
ssh -L 8080:localhost:9999 user@your-server-ip
# Then open http://localhost:8080
```
**Pros**: most secure; no public port needs to be opened
**Cons**: requires SSH access; local use only
## Playwright MCP Testing
Once the environment is up, you can use Playwright MCP for automated testing:
```javascript
// Basic connectivity test
await page.goto('http://your-reddit-url:9999');
// Navigation test
await page.click('text=Forums');
await page.click('text=AskReddit');
// Form interaction test
await page.click('text=Log in');
await page.fill('[placeholder="Username"]', 'testuser');
await page.fill('[placeholder="Password"]', 'testpass');
```
## Troubleshooting
### Container Fails to Start
```bash
# Check container status
docker ps -a
# Inspect the logs
docker logs forum
# Restart the container
docker restart forum
```
### Service Not Ready
```bash
# Check whether PostgreSQL has fully started
docker logs forum | grep "database system is ready"
# Wait longer (database recovery takes time)
sleep 300
```
### Port Already in Use
```bash
# Check what is using the port
netstat -tlnp | grep 9999
# Use a different port
docker run --name forum -p 8888:80 -d postmill-populated-exposed-withimg
```
## Resetting the Environment
Reset the environment after testing:
```bash
# Stop and remove the container
docker stop forum
docker rm forum
# Start it again
docker run --name forum -p 9999:80 -d postmill-populated-exposed-withimg
```
## Advanced Configuration
### Environment Variables (WebArena convention)
```bash
export REDDIT="your-server-hostname:9999"
export REDDIT_URL="http://your-server-hostname:9999"
```
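When tasks are driven through MCPMark, `PlaywrightTaskManager.run_verification` exports `WEBARENA_BASE_URL` for `verify.py`. A minimal sketch of how a verification script might resolve the base URL, falling back to the `REDDIT_URL` convention above (the localhost default is purely illustrative):
```python
import os

# WEBARENA_BASE_URL is exported by PlaywrightTaskManager.run_verification;
# REDDIT_URL follows the WebArena convention shown above. The localhost
# fallback is only an illustrative default.
BASE_URL = (
    os.environ.get("WEBARENA_BASE_URL")
    or os.environ.get("REDDIT_URL")
    or "http://localhost:9999"
).rstrip("/")

print(f"Verifying against {BASE_URL}")
```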
### Batch Task Testing
```bash
# Prepare the WebArena test configuration
mkdir -p ~/.webarena
echo "REDDIT=your-server-hostname:9999" >> ~/.webarena/config
```
---
**Note**: This Reddit environment ships with thousands of pre-populated posts and users and closely mirrors a real Reddit deployment, which makes it well suited for testing complex web automation tasks.
================================================
FILE: src/mcp_services/postgres/__init__.py
================================================
"""
PostgreSQL MCP Service for MCPMark
===================================
This module provides PostgreSQL database integration for MCPMark evaluation.
"""
from .postgres_login_helper import PostgresLoginHelper
from .postgres_state_manager import PostgresStateManager
from .postgres_task_manager import PostgresTaskManager, PostgresTask
__all__ = [
"PostgresLoginHelper",
"PostgresStateManager",
"PostgresTaskManager",
"PostgresTask",
]
================================================
FILE: src/mcp_services/postgres/postgres_login_helper.py
================================================
"""
PostgreSQL Login Helper for MCPMark
====================================
Handles PostgreSQL authentication and connection validation.
"""
import json
import psycopg2
from pathlib import Path
from typing import Optional, Dict, Any
from src.base.login_helper import BaseLoginHelper
from src.logger import get_logger
logger = get_logger(__name__)
class PostgresLoginHelper(BaseLoginHelper):
"""Handles PostgreSQL authentication and connection validation."""
def __init__(
self,
host: str = "localhost",
port: int = 5432,
database: str = "postgres",
username: str = "postgres",
password: str = None,
state_path: Optional[Path] = None,
):
"""Initialize PostgreSQL login helper.
Args:
host: Database host
port: Database port
database: Database name
username: Database username
password: Database password
state_path: Path to save connection state
"""
super().__init__()
self.host = host
self.port = port
self.database = database
self.username = username
self.password = password
self.state_path = state_path or Path.home() / ".mcpbench" / "postgres_auth.json"
# Ensure state directory exists
self.state_path.parent.mkdir(parents=True, exist_ok=True)
def login(self, **kwargs) -> bool:
"""Test PostgreSQL connection and save state.
Returns:
bool: True if connection successful
"""
try:
# Test connection
conn = psycopg2.connect(
host=self.host,
port=self.port,
database=self.database,
user=self.username,
password=self.password,
connect_timeout=10,
)
# Execute test query
with conn.cursor() as cur:
cur.execute("SELECT version()")
version = cur.fetchone()[0]
logger.info(f"PostgreSQL connection successful: {version}")
# Check permissions
cur.execute(
"""
SELECT has_database_privilege(%s, 'CREATE')
""",
(self.database,),
)
can_create = cur.fetchone()[0]
if not can_create:
logger.warning("User does not have CREATE privilege on database")
conn.close()
# Save connection state
self._save_connection_state(
{
"host": self.host,
"port": self.port,
"database": self.database,
"username": self.username,
"version": version,
"can_create": can_create,
"authenticated_at": self._get_current_timestamp(),
}
)
return True
except psycopg2.Error as e:
logger.error(f"PostgreSQL connection failed: {e}")
return False
except Exception as e:
logger.error(f"Unexpected error during PostgreSQL login: {e}")
return False
def _save_connection_state(self, state: Dict[str, Any]):
"""Save connection state to file."""
try:
# Don't save password
safe_state = {k: v for k, v in state.items() if k != "password"}
with open(self.state_path, "w") as f:
json.dump(safe_state, f, indent=2)
# Set restrictive permissions
self.state_path.chmod(0o600)
logger.info(f"Connection state saved to: {self.state_path}")
except Exception as e:
logger.error(f"Failed to save connection state: {e}")
def _get_current_timestamp(self) -> str:
"""Get current timestamp in ISO format."""
from datetime import datetime, timezone
return datetime.now(timezone.utc).isoformat()
def is_connected(self) -> bool:
"""Check if we can connect to PostgreSQL."""
return self.login()
def get_connection_params(self) -> Dict[str, Any]:
"""Get connection parameters (without password)."""
return {
"host": self.host,
"port": self.port,
"database": self.database,
"user": self.username,
}
================================================
FILE: src/mcp_services/postgres/postgres_state_manager.py
================================================
"""
PostgreSQL State Manager for MCPMark
=====================================
Manages database state for PostgreSQL tasks including schema setup,
test data creation, and cleanup.
"""
import os
import subprocess
import sys
import psycopg2
from psycopg2 import sql
from pathlib import Path
from typing import Optional, Dict, Any, List
from src.base.state_manager import BaseStateManager, InitialStateInfo
from src.base.task_manager import BaseTask
from src.logger import get_logger
logger = get_logger(__name__)
class PostgresStateManager(BaseStateManager):
"""Manages PostgreSQL database state for task evaluation."""
def __init__(
self,
host: str = "localhost",
port: int = 5432,
database: str = "postgres",
username: str = "postgres",
password: str = None,
):
"""Initialize PostgreSQL state manager.
Args:
host: Database host
port: Database port
database: Main database name
username: Database username
password: Database password
"""
super().__init__(service_name="postgres")
self.host = host
self.port = port
self.database = database
self.username = username
self.password = password
# Connection parameters
self.conn_params = {
"host": host,
"port": port,
"user": username,
"password": password,
}
# Track created databases for cleanup
self.created_databases: List[str] = []
# Track current task database for agent configuration
self._current_task_database: Optional[str] = None
# Validate connection on initialization
try:
self._test_connection()
logger.info("PostgreSQL state manager initialized successfully")
self._setup_database()
except Exception as e:
raise RuntimeError(f"PostgreSQL initialization failed: {e}")
def _test_connection(self):
"""Test database connection."""
conn = psycopg2.connect(**self.conn_params, database="postgres")
conn.close()
def _setup_database(self):
"""Setup all required databases by downloading and restoring from backup."""
databases = ['employees', 'chinook', 'dvdrental', 'sports', 'lego']
for db_name in databases:
if not self._database_exists(db_name):
logger.info(f"Setting up {db_name} database...")
# Path to backup file
backup_dir = Path(__file__).parent.parent.parent.parent / "postgres_state"
backup_file = backup_dir / f"{db_name}.backup"
# Download backup if not exists
if not backup_file.exists():
backup_dir.mkdir(parents=True, exist_ok=True)
logger.info(f"Downloading {db_name} backup...")
try:
import urllib.request
urllib.request.urlretrieve(
f'https://storage.mcpmark.ai/postgres/{db_name}.backup',
str(backup_file)
)
logger.info(f"{db_name} backup downloaded")
except Exception as e:
logger.warning(f"Failed to download {db_name} backup: {e}")
continue
# Create database
try:
self._create_empty_database(db_name)
logger.info(f"Created {db_name} database")
except Exception as e:
logger.warning(f"Failed to create {db_name} database: {e}")
continue
# Restore from backup
env = os.environ.copy()
env['PGPASSWORD'] = self.password
try:
result = subprocess.run([
'pg_restore',
'-h', str(self.host),
'-p', str(self.port),
'-U', self.username,
'-d', db_name,
'-v',
str(backup_file)
], env=env, capture_output=True, text=True)
if result.returncode != 0 and "ERROR" in result.stderr:
logger.warning(f"pg_restore had errors for {db_name}: {result.stderr}")
else:
logger.info(f"{db_name} database restored successfully")
except Exception as e:
logger.warning(f"Failed to restore {db_name} database: {e}")
else:
logger.debug(f"{db_name} database already exists")
def _create_initial_state(self, task: BaseTask) -> Optional[InitialStateInfo]:
"""Create initial database state for a task."""
try:
# Generate unique database name
db_name = f"mcpmark_{task.category_id}_{task.task_id}_{self._get_timestamp()}"
# Create database from template if exists, otherwise empty
if self._database_exists(task.category_id):
self._create_database_from_template(db_name, task.category_id)
logger.info(
f"| Created database '{db_name}' from template '{task.category_id}'"
)
else:
self._create_empty_database(db_name)
logger.info(f"| Created empty database '{db_name}'")
# Run prepare_environment.py if it exists
self._run_prepare_environment(db_name, task)
logger.info(f"| Prepared environment for database '{db_name}'")
# Track for cleanup
self.created_databases.append(db_name)
self.track_resource("database", db_name, {"task": task.name})
return InitialStateInfo(
state_id=db_name,
state_url=f"postgresql://{self.username}@{self.host}:{self.port}/{db_name}",
metadata={
"database": db_name,
"category": task.category_id,
"task_id": task.task_id,
},
)
except Exception as e:
logger.error(f"Failed to create initial state for {task.name}: {e}")
return None
def _store_initial_state_info(
self, task: BaseTask, state_info: InitialStateInfo
) -> None:
"""Store database info in task object."""
if hasattr(task, "__dict__"):
task.database_name = state_info.state_id
task.database_url = state_info.state_url
# Store current task database for agent configuration
self._current_task_database = state_info.state_id
def _cleanup_task_initial_state(self, task: BaseTask) -> bool:
"""Clean up task database."""
if hasattr(task, "database_name") and task.database_name:
try:
self._drop_database(task.database_name)
logger.info(f"| Dropped database: {task.database_name}")
# Remove from tracking
self.created_databases = [
db for db in self.created_databases if db != task.database_name
]
# Clear current task database
if self._current_task_database == task.database_name:
self._current_task_database = None
return True
except Exception as e:
logger.error(f"Failed to drop database {task.database_name}: {e}")
return False
return True
def _cleanup_single_resource(self, resource: Dict[str, Any]) -> bool:
"""Clean up a single PostgreSQL resource."""
if resource["type"] == "database":
try:
self._drop_database(resource["id"])
logger.info(f"| Dropped database: {resource['id']}")
return True
except Exception as e:
logger.error(f"| Failed to drop database {resource['id']}: {e}")
return False
return False
def _database_exists(self, db_name: str) -> bool:
"""Check if database exists."""
conn = psycopg2.connect(**self.conn_params, database="postgres")
try:
with conn.cursor() as cur:
cur.execute("SELECT 1 FROM pg_database WHERE datname = %s", (db_name,))
return cur.fetchone() is not None
finally:
conn.close()
def _create_database_from_template(self, new_db: str, template_db: str):
"""Create database from template."""
conn = psycopg2.connect(**self.conn_params, database="postgres")
conn.autocommit = True
try:
with conn.cursor() as cur:
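# CREATE DATABASE ... WITH TEMPLATE fails if any other session is
# connected to the template database, so terminate those sessions first.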
cur.execute(
sql.SQL("""
SELECT pg_terminate_backend(pid)
FROM pg_stat_activity
WHERE datname = %s AND pid <> pg_backend_pid()
"""),
(template_db,),
)
cur.execute(
sql.SQL("CREATE DATABASE {} WITH TEMPLATE {}").format(
sql.Identifier(new_db), sql.Identifier(template_db)
)
)
finally:
conn.close()
def _create_empty_database(self, db_name: str):
"""Create empty database."""
conn = psycopg2.connect(**self.conn_params, database="postgres")
conn.autocommit = True
try:
with conn.cursor() as cur:
cur.execute(
sql.SQL("CREATE DATABASE {}").format(sql.Identifier(db_name))
)
finally:
conn.close()
def _drop_database(self, db_name: str):
"""Drop database."""
conn = psycopg2.connect(**self.conn_params, database="postgres")
conn.autocommit = True
try:
with conn.cursor() as cur:
# Terminate connections
cur.execute(
sql.SQL("""
SELECT pg_terminate_backend(pid)
FROM pg_stat_activity
WHERE datname = %s AND pid <> pg_backend_pid()
"""),
(db_name,),
)
# Drop database
cur.execute(
sql.SQL("DROP DATABASE IF EXISTS {}").format(
sql.Identifier(db_name)
)
)
finally:
conn.close()
def _run_prepare_environment(self, db_name: str, task: BaseTask):
"""Run prepare_environment.py script if it exists in the task directory."""
# Find the task directory containing prepare_environment.py
task_dir = task.task_instruction_path.parent
prepare_script = task_dir / "prepare_environment.py"
if not prepare_script.exists():
logger.debug(f"No prepare_environment.py found for task {task.name}")
return
logger.info(f"| Running prepare_environment.py for task {task.name}")
# Set up environment variables for the script
env = os.environ.copy()
env.update({
"POSTGRES_HOST": str(self.host),
"POSTGRES_PORT": str(self.port),
"POSTGRES_DATABASE": db_name,
"POSTGRES_USERNAME": self.username,
"POSTGRES_PASSWORD": self.password or "",
})
try:
# Run the prepare_environment.py script
result = subprocess.run(
[sys.executable, str(prepare_script)],
cwd=str(task_dir), # Run from task directory to access data/ folder
env=env,
capture_output=True,
text=True,
timeout=300, # 5 minute timeout
)
if result.returncode == 0:
logger.info(f"| ✓ Environment preparation completed for {task.name}")
if result.stdout.strip():
logger.debug(f"| prepare_environment.py output: {result.stdout}")
else:
logger.error(f"| ❌ Environment preparation failed for {task.name}")
logger.error(f"| Error output: {result.stderr}")
raise RuntimeError(f"prepare_environment.py failed with exit code {result.returncode}")
except subprocess.TimeoutExpired:
logger.error(f"❌ Environment preparation timed out for {task.name}")
raise RuntimeError("prepare_environment.py execution timed out")
except Exception as e:
logger.error(f"❌ Failed to run prepare_environment.py for {task.name}: {e}")
raise
def _setup_task_specific_data(self, db_name: str, task: BaseTask):
"""Set up task-specific schema and data."""
conn = psycopg2.connect(**self.conn_params, database=db_name)
try:
with conn.cursor() as cur:
if task.category_id == "basic_queries":
self._setup_basic_queries_data(cur)
elif task.category_id == "data_manipulation":
self._setup_data_manipulation_data(cur)
elif task.category_id == "table_operations":
self._setup_table_operations_data(cur)
# Add more categories as needed
conn.commit()
except Exception as e:
conn.rollback()
logger.error(f"Failed to setup task data: {e}")
raise
finally:
conn.close()
def _setup_basic_queries_data(self, cursor):
"""Set up data for basic query tasks."""
cursor.execute("""
CREATE TABLE employees (
id SERIAL PRIMARY KEY,
name VARCHAR(100) NOT NULL,
department VARCHAR(50),
salary DECIMAL(10, 2),
hire_date DATE
);
INSERT INTO employees (name, department, salary, hire_date) VALUES
('John Doe', 'Engineering', 75000.00, '2020-01-15'),
('Jane Smith', 'Marketing', 65000.00, '2019-03-22'),
('Bob Johnson', 'Engineering', 80000.00, '2018-07-01'),
('Alice Brown', 'HR', 55000.00, '2021-02-10');
""")
def _setup_data_manipulation_data(self, cursor):
"""Set up data for data manipulation tasks."""
cursor.execute("""
CREATE TABLE products (
id SERIAL PRIMARY KEY,
name VARCHAR(100) NOT NULL,
category VARCHAR(50),
price DECIMAL(10, 2),
stock INTEGER DEFAULT 0
);
CREATE TABLE orders (
id SERIAL PRIMARY KEY,
product_id INTEGER REFERENCES products(id),
quantity INTEGER NOT NULL,
order_date TIMESTAMP DEFAULT CURRENT_TIMESTAMP
);
""")
def _setup_table_operations_data(self, cursor):
"""Set up for table operation tasks."""
# Start with minimal schema that tasks will modify
cursor.execute("""
CREATE TABLE test_table (
id SERIAL PRIMARY KEY,
data VARCHAR(255)
);
""")
def _get_timestamp(self) -> str:
"""Get timestamp for unique naming."""
from datetime import datetime
return datetime.now().strftime("%Y%m%d%H%M%S")
def get_service_config_for_agent(self) -> dict:
"""Get configuration for agent execution."""
config = {
"host": self.host,
"port": self.port,
"username": self.username,
"password": self.password,
}
# If there's a current task database, include it
if hasattr(self, "_current_task_database") and self._current_task_database:
config["current_database"] = self._current_task_database
config["database_url"] = (
f"postgresql://{self.username}:{self.password}@{self.host}:{self.port}/{self._current_task_database}"
)
else:
# Fallback to default database
config["database"] = self.database
config["database_url"] = (
f"postgresql://{self.username}:{self.password}@{self.host}:{self.port}/{self.database}"
)
return config
================================================
FILE: src/mcp_services/postgres/postgres_task_manager.py
================================================
"""
PostgreSQL Task Manager for MCPMark
====================================
Manages PostgreSQL task discovery, execution, and verification.
"""
import os
import subprocess
import sys
from dataclasses import dataclass
from pathlib import Path
from typing import Any, Dict, List, Optional
from src.base.task_manager import BaseTask, BaseTaskManager
from src.logger import get_logger
logger = get_logger(__name__)
@dataclass
class PostgresTask(BaseTask):
"""PostgreSQL-specific task with database information."""
task_name: str = ""
database_name: Optional[str] = None
database_url: Optional[str] = None
expected_queries: Optional[List[str]] = None
expected_tables: Optional[List[str]] = None
class PostgresTaskManager(BaseTaskManager):
"""Manages PostgreSQL tasks for MCPMark evaluation."""
def __init__(self, tasks_root: Path = None, task_suite: str = "standard"):
"""Initialize PostgreSQL task manager.
Args:
tasks_root: Path to tasks directory
task_suite: Logical task suite (e.g., 'standard', 'easy')
"""
if tasks_root is None:
tasks_root = Path(__file__).resolve().parents[3] / "tasks"
super().__init__(
tasks_root,
mcp_service="postgres",
task_class=PostgresTask,
task_organization="file", # PostgreSQL uses file-based tasks
task_suite=task_suite,
)
def _create_task_from_files(
self, category_id: str, task_files_info: Dict[str, Any]
) -> Optional[PostgresTask]:
"""Instantiate a `PostgresTask` from the dictionary returned by `_find_task_files`."""
import json
# Check for meta.json
meta_path = task_files_info["instruction_path"].parent / "meta.json"
final_category_id = category_id
task_id = task_files_info["task_id"]
if meta_path.exists():
try:
with open(meta_path, 'r') as f:
meta_data = json.load(f)
# Use values from meta.json if available
final_category_id = meta_data.get("category_id", category_id)
task_id = meta_data.get("task_id", task_id)
except Exception as e:
logger.warning(f"Failed to load meta.json from {meta_path}: {e}")
return PostgresTask(
task_instruction_path=task_files_info["instruction_path"],
task_verification_path=task_files_info["verification_path"],
service="postgres",
category_id=final_category_id,
task_id=task_id,
task_name=task_files_info["task_id"],
)
def _get_verification_command(self, task: PostgresTask) -> List[str]:
"""Get verification command with database info."""
cmd = [sys.executable, str(task.task_verification_path)]
# Pass database name as argument if available
if task.database_name:
cmd.append(task.database_name)
return cmd
def run_verification(self, task: BaseTask) -> subprocess.CompletedProcess:
"""Run verification with PostgreSQL environment."""
env = os.environ.copy()
# Pass database connection info to verification script
if hasattr(task, "database_name") and task.database_name:
env["POSTGRES_DATABASE"] = task.database_name
if hasattr(task, "database_url") and task.database_url:
env["DATABASE_URL"] = task.database_url
return subprocess.run(
self._get_verification_command(task),
capture_output=True,
text=True,
timeout=300,
env=env,
)
def _format_task_instruction(self, base_instruction: str) -> str:
"""Add PostgreSQL-specific instructions."""
return (
base_instruction
+ "\n\nNote: Use PostgreSQL MCP tools to complete this task. The database connection is already configured."
)
================================================
FILE: src/mcp_services/supabase/__init__.py
================================================
"""Supabase MCP service integration for MCPMark."""
from .supabase_login_helper import SupabaseLoginHelper
from .supabase_state_manager import SupabaseStateManager
from .supabase_task_manager import SupabaseTaskManager
__all__ = [
"SupabaseLoginHelper",
"SupabaseStateManager",
"SupabaseTaskManager",
]
================================================
FILE: src/mcp_services/supabase/supabase_login_helper.py
================================================
"""
Supabase Login Helper for MCPMark
===================================
Handles configuration and validation for Supabase MCP service.
"""
import os
from typing import Dict, Any, Optional
from src.base.login_helper import BaseLoginHelper
from src.logger import get_logger
logger = get_logger(__name__)
class SupabaseLoginHelper(BaseLoginHelper):
"""Login helper for Supabase MCP service.
Validates PostgREST API URL and API key configuration.
"""
def __init__(self):
super().__init__("supabase")
def prepare_credentials(self) -> Dict[str, Any]:
"""Prepare credentials for Supabase/PostgREST connection.
Returns:
Dictionary containing api_url, api_key, and postgres connection details
"""
# Get PostgREST API configuration (from Supabase CLI)
api_url = os.getenv("SUPABASE_API_URL", "http://localhost:54321")
api_key = os.getenv("SUPABASE_API_KEY")
# Get PostgreSQL connection details (Supabase CLI defaults)
postgres_host = os.getenv("SUPABASE_DB_HOST", "localhost")
postgres_port = int(os.getenv("SUPABASE_DB_PORT", "54322"))
postgres_user = os.getenv("SUPABASE_DB_USER", "postgres")
postgres_password = os.getenv("SUPABASE_DB_PASSWORD", "postgres")
postgres_database = os.getenv("SUPABASE_DB_NAME", "postgres")
if not api_key:
logger.warning(
"SUPABASE_API_KEY not set.\n"
"Run 'supabase status' to get your anon or service_role key.\n"
"Set SUPABASE_API_KEY in your .mcp_env file."
)
# Try to get it from supabase status
api_key = self._get_key_from_supabase_status()
return {
"api_url": api_url,
"api_key": api_key or "",
"postgres_host": postgres_host,
"postgres_port": postgres_port,
"postgres_user": postgres_user,
"postgres_password": postgres_password,
"postgres_database": postgres_database,
}
def _get_key_from_supabase_status(self) -> Optional[str]:
"""Try to get anon key from supabase status command.
Returns:
Anon key if found, None otherwise
"""
import subprocess
try:
result = subprocess.run(
["supabase", "status"],
capture_output=True,
text=True,
timeout=10,
)
if result.returncode == 0:
# Parse output for anon key
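# Expected line shape (assumption based on typical `supabase status` output):
#   anon key: eyJhbGciOiJIUzI1NiIs...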
for line in result.stdout.split('\n'):
if 'anon key:' in line.lower():
# Extract the key after the colon
key = line.split(':', 1)[1].strip()
logger.info("Found anon key from 'supabase status'")
return key
except (subprocess.SubprocessError, FileNotFoundError):
logger.debug("Could not run 'supabase status' to get anon key")
return None
def test_credentials(self, credentials: Dict[str, Any]) -> bool:
"""Test if Supabase credentials are valid.
Args:
credentials: Dictionary with api_url, api_key, and postgres connection details
Returns:
True if credentials are valid
"""
import requests
import psycopg2
api_url = credentials["api_url"]
api_key = credentials.get("api_key", "")
# Test PostgreSQL connection
try:
conn_params = {
"host": credentials["postgres_host"],
"port": credentials["postgres_port"],
"user": credentials["postgres_user"],
"password": credentials["postgres_password"],
"database": credentials["postgres_database"],
}
conn = psycopg2.connect(**conn_params)
conn.close()
logger.info("✓ PostgreSQL connection successful")
except Exception as e:
logger.error(f"✗ PostgreSQL connection failed: {e}")
return False
# Test PostgREST API connection (optional - may not be running yet)
try:
headers = {}
if api_key:
headers["apikey"] = api_key
headers["Authorization"] = f"Bearer {api_key}"
response = requests.get(api_url, headers=headers, timeout=5)
# Any response (including 404, 401) means the API is reachable
logger.info(f"✓ PostgREST API reachable at {api_url} (status: {response.status_code})")
return True
except requests.exceptions.ConnectionError:
logger.warning(
f"⚠ PostgREST API not reachable at {api_url}.\n"
"Make sure PostgREST is running (e.g., docker run -p 3000:3000 postgrest/postgrest)\n"
"or use a cloud Supabase instance URL."
)
# Still return True as PostgreSQL connection works
return True
except Exception as e:
logger.warning(f"⚠ PostgREST API test failed: {e}")
# Still return True as PostgreSQL connection works
return True
def format_credentials_info(self, credentials: Dict[str, Any]) -> str:
"""Format credentials info for display.
Args:
credentials: Dictionary with connection details
Returns:
Formatted string describing the credentials
"""
api_url = credentials["api_url"]
has_api_key = bool(credentials.get("api_key"))
postgres_host = credentials["postgres_host"]
postgres_db = credentials["postgres_database"]
return (
f"Supabase Configuration:\n"
f" API URL: {api_url}\n"
f" API Key: {'✓ Configured' if has_api_key else '✗ Not set'}\n"
f" PostgreSQL: {postgres_host}/{postgres_db}"
)
================================================
FILE: src/mcp_services/supabase/supabase_state_manager.py
================================================
"""
Supabase State Manager for MCPMark
====================================
Manages database state for Supabase tasks using the same PostgreSQL backend
as Insforge, but accessed via PostgREST/Supabase MCP server.
"""
import os
import sys
import subprocess
import psycopg2
from psycopg2 import sql
from pathlib import Path
from typing import Optional, Dict, Any, List
from src.base.state_manager import BaseStateManager, InitialStateInfo
from src.base.task_manager import BaseTask
from src.logger import get_logger
logger = get_logger(__name__)
class SupabaseStateManager(BaseStateManager):
"""Manages Supabase/PostgREST database state for task evaluation.
Uses the same PostgreSQL database as Insforge but exposes it via
PostgREST API for the Supabase MCP server to access.
"""
def __init__(
self,
api_url: str,
api_key: str,
postgres_host: str = "localhost",
postgres_port: int = 54322, # Supabase CLI default port
postgres_user: str = "postgres",
postgres_password: str = "postgres",
postgres_database: str = "postgres", # Supabase CLI default database
):
"""Initialize Supabase state manager.
Args:
api_url: PostgREST API URL from Supabase CLI (default: http://localhost:54321)
api_key: API key from Supabase CLI (anon or service_role key)
postgres_host: PostgreSQL host for direct database operations
postgres_port: PostgreSQL port (Supabase CLI uses 54322)
postgres_user: PostgreSQL username
postgres_password: PostgreSQL password
postgres_database: Main PostgreSQL database name
"""
super().__init__(service_name="supabase")
self.api_url = api_url.rstrip('/')
self.api_key = api_key
# PostgreSQL connection for state management (Supabase CLI instance)
self.postgres_host = postgres_host
self.postgres_port = postgres_port
self.postgres_user = postgres_user
self.postgres_password = postgres_password
self.postgres_database = postgres_database
# Track current task context for agent configuration
self._current_task_context: Optional[Dict[str, Any]] = None
# Validate connection on initialization
try:
self._test_connection()
logger.info("Supabase state manager initialized successfully")
except Exception as e:
raise RuntimeError(f"Supabase initialization failed: {e}")
# Store baseline tables (system tables that exist before any tasks run)
self._baseline_tables = set(
(t['schema'], t['name']) for t in self._get_all_tables()
)
logger.debug(f"Stored baseline: {len(self._baseline_tables)} tables")
def _test_connection(self):
"""Test PostgreSQL connection."""
try:
conn_params = {
"host": self.postgres_host,
"port": self.postgres_port,
"user": self.postgres_user,
"password": self.postgres_password,
"database": self.postgres_database,
}
conn = psycopg2.connect(**conn_params)
conn.close()
logger.debug("PostgreSQL connection test successful")
except Exception as e:
raise RuntimeError(f"Cannot connect to PostgreSQL: {e}")
def _create_initial_state(self, task: BaseTask) -> Optional[InitialStateInfo]:
"""Create initial backend state for a task.
Restores from backup which may place tables in public or task-specific schema.
Args:
task: Task for which to create initial state
Returns:
InitialStateInfo object or None if creation failed
"""
try:
# Generate unique state ID for this task run
state_id = f"{task.category_id}_{task.task_id}_{self._get_timestamp()}"
schema_name = task.category_id
logger.info(f"| Creating initial state for Supabase task: {task.name}")
# Drop schema first (cleanup from previous runs)
self._drop_schema(schema_name)
# Get list of existing tables before restore (to track what we create)
tables_before = self._get_all_tables()
logger.info(f"| Tables before restore: {len(tables_before)}")
# Note: Don't create schema here - pg_restore will create it from the backup
# Restore from backup if backup exists (may create tables in public or task schema)
if self._restore_from_backup(schema_name):
logger.info(f"| ✓ Restored '{schema_name}' from backup")
else:
logger.info(f"| ○ No backup found for '{schema_name}'")
# Run prepare_environment.py if it exists
task_prepared = self._run_prepare_environment(task)
if not task_prepared:
logger.debug(f"| No prepare_environment.py found for task {task.name}")
# Get list of tables after restore (to track what we need to clean up)
tables_after = self._get_all_tables()
# Track ALL new tables created by the restore (compare before/after)
tables_before_set = {(t['schema'], t['name']) for t in tables_before}
created_tables = [
t for t in tables_after
if (t['schema'], t['name']) not in tables_before_set
]
logger.info(f"| Tracked {len(created_tables)} new tables for cleanup")
for t in created_tables:
logger.debug(f"| - {t['schema']}.{t['name']}")
# Track the task context including created tables
context = {
"state_id": state_id,
"category_id": task.category_id,
"task_id": task.task_id,
"task_name": task.name,
"schema": schema_name,
"created_tables": created_tables,
}
return InitialStateInfo(
state_id=state_id,
state_url=self.api_url,
metadata=context,
)
except Exception as e:
logger.error(f"Failed to create initial state for {task.name}: {e}")
return None
def _store_initial_state_info(
self, task: BaseTask, state_info: InitialStateInfo
) -> None:
"""Store backend info in task object for agent access."""
if hasattr(task, "__dict__"):
task.api_url = self.api_url
task.api_key = self.api_key
task.state_id = state_info.state_id
# Store current task context for agent configuration
self._current_task_context = state_info.metadata
def _cleanup_task_initial_state(self, task: BaseTask) -> bool:
"""Clean up task-specific resources.
Drops ALL tables created during task (both setup and agent-created)
by comparing against baseline.
Args:
task: Task whose initial state should be cleaned up
Returns:
True if cleanup successful
"""
try:
logger.info(f"| Cleaning up initial state for task: {task.name}")
if self._current_task_context:
schema_name = self._current_task_context.get("schema")
# Get ALL current tables
all_current_tables = self._get_all_tables()
# Find tables to drop: anything not in baseline
tables_to_drop = [
t for t in all_current_tables
if (t['schema'], t['name']) not in self._baseline_tables
]
logger.info(f"| Found {len(tables_to_drop)} tables to clean up (setup + agent-created)")
# Drop individual tables
for table_info in tables_to_drop:
try:
self._drop_table(table_info["schema"], table_info["name"])
logger.debug(f"| ✓ Dropped table: {table_info['schema']}.{table_info['name']}")
except Exception as e:
logger.warning(f"| Failed to drop table {table_info}: {e}")
# Drop the task schema (may be empty if all tables were in public)
if schema_name:
try:
self._drop_schema(schema_name)
logger.info(f"| ✓ Dropped schema: {schema_name}")
except Exception as e:
logger.warning(f"| Failed to drop schema {schema_name}: {e}")
# Clear task context
if self._current_task_context.get("task_name") == task.name:
self._current_task_context = None
logger.info(f"| ✓ Initial state cleanup completed for {task.name}")
return True
except Exception as e:
logger.error(f"Failed to cleanup task initial state for {task.name}: {e}")
return False
def _cleanup_single_resource(self, resource: Dict[str, Any]) -> bool:
"""Clean up a single tracked resource.
Args:
resource: Resource dictionary with type, id, and metadata
Returns:
True if cleanup successful
"""
resource_type = resource["type"]
resource_id = resource["id"]
logger.debug(f"| Cleanup for {resource_type} {resource_id} (handled by task scripts)")
return True
def _run_prepare_environment(self, task: BaseTask) -> bool:
"""Run prepare_environment.py script if it exists in the task directory.
The script should use database operations to set up required state.
Args:
task: Task for which to prepare environment
Returns:
True if script ran successfully, False if script doesn't exist
"""
task_dir = task.task_instruction_path.parent
prepare_script = task_dir / "prepare_environment.py"
if not prepare_script.exists():
logger.debug(f"No prepare_environment.py found for task {task.name}")
return False
logger.info(f"| Running prepare_environment.py for task {task.name}")
# Set up environment variables for the script
env = os.environ.copy()
env.update({
"SUPABASE_API_URL": self.api_url,
"SUPABASE_API_KEY": self.api_key,
"POSTGRES_HOST": self.postgres_host,
"POSTGRES_PORT": str(self.postgres_port),
"POSTGRES_DATABASE": self.postgres_database,
"POSTGRES_USERNAME": self.postgres_user,
"POSTGRES_PASSWORD": self.postgres_password,
})
try:
# Run the prepare_environment.py script
result = subprocess.run(
[sys.executable, str(prepare_script)],
cwd=str(task_dir), # Run from task directory
env=env,
capture_output=True,
text=True,
timeout=300, # 5 minute timeout
)
if result.returncode == 0:
logger.info(f"| ✓ Environment preparation completed for {task.name}")
if result.stdout.strip():
logger.debug(f"| prepare_environment.py output: {result.stdout}")
return True
else:
logger.error(f"| ✗ Environment preparation failed for {task.name}")
logger.error(f"| Error output: {result.stderr}")
raise RuntimeError(f"prepare_environment.py failed with exit code {result.returncode}")
except subprocess.TimeoutExpired:
logger.error(f"✗ Environment preparation timed out for {task.name}")
raise RuntimeError("prepare_environment.py execution timed out")
except Exception as e:
logger.error(f"✗ Failed to run prepare_environment.py for {task.name}: {e}")
raise
def _get_timestamp(self) -> str:
"""Get timestamp for unique naming."""
from datetime import datetime
return datetime.now().strftime("%Y%m%d%H%M%S")
def _drop_schema(self, schema_name: str) -> None:
"""Drop schema and all its contents."""
conn_params = {
"host": self.postgres_host,
"port": self.postgres_port,
"user": self.postgres_user,
"password": self.postgres_password,
"database": self.postgres_database,
}
conn = psycopg2.connect(**conn_params)
conn.autocommit = True
try:
with conn.cursor() as cur:
cur.execute(
sql.SQL("DROP SCHEMA IF EXISTS {} CASCADE").format(
sql.Identifier(schema_name)
)
)
logger.debug(f"| Dropped schema: {schema_name}")
finally:
conn.close()
def _create_schema(self, schema_name: str) -> None:
"""Create empty schema."""
conn_params = {
"host": self.postgres_host,
"port": self.postgres_port,
"user": self.postgres_user,
"password": self.postgres_password,
"database": self.postgres_database,
}
conn = psycopg2.connect(**conn_params)
conn.autocommit = True
try:
with conn.cursor() as cur:
cur.execute(
sql.SQL("CREATE SCHEMA {}").format(sql.Identifier(schema_name))
)
logger.debug(f"| Created schema: {schema_name}")
finally:
conn.close()
def _get_all_tables(self) -> List[Dict[str, str]]:
"""Get list of all user tables.
Returns:
List of dicts with 'schema' and 'name' keys
"""
conn_params = {
"host": self.postgres_host,
"port": self.postgres_port,
"user": self.postgres_user,
"password": self.postgres_password,
"database": self.postgres_database,
}
conn = psycopg2.connect(**conn_params)
try:
with conn.cursor() as cur:
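# Only user tables are listed: system schemas are excluded, as are tables
# whose names start with "_" (treated as internally managed).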
cur.execute("""
SELECT table_schema, table_name
FROM information_schema.tables
WHERE table_type = 'BASE TABLE'
AND table_schema NOT IN ('information_schema', 'pg_catalog')
AND table_schema NOT LIKE 'pg_%'
AND table_name NOT LIKE '\\_%'
ORDER BY table_schema, table_name
""")
rows = cur.fetchall()
return [{"schema": row[0], "name": row[1]} for row in rows]
finally:
conn.close()
def _drop_table(self, schema_name: str, table_name: str) -> None:
"""Drop a specific table or materialized view."""
conn_params = {
"host": self.postgres_host,
"port": self.postgres_port,
"user": self.postgres_user,
"password": self.postgres_password,
"database": self.postgres_database,
}
conn = psycopg2.connect(**conn_params)
conn.autocommit = True
try:
with conn.cursor() as cur:
# Try dropping as table first
cur.execute(
sql.SQL("DROP TABLE IF EXISTS {}.{} CASCADE").format(
sql.Identifier(schema_name),
sql.Identifier(table_name)
)
)
# Also try dropping as materialized view (in case agent created one)
cur.execute(
sql.SQL("DROP MATERIALIZED VIEW IF EXISTS {}.{} CASCADE").format(
sql.Identifier(schema_name),
sql.Identifier(table_name)
)
)
logger.debug(f"| Dropped table/view: {schema_name}.{table_name}")
finally:
conn.close()
def _restore_from_backup(self, category_name: str) -> bool:
"""Restore from backup file.
Tables may be restored into public schema or category-specific schema
depending on how the backup was created.
Args:
category_name: Name of category (e.g., 'employees', 'chinook', 'lego')
Returns:
True if backup was restored, False if no backup exists
"""
# Path to backup file (same as used by Insforge/Postgres)
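# Resolves to <repo_root>/postgres_state/<category_name>.backup, i.e. four
# directory levels up from src/mcp_services/supabase/.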
backup_dir = Path(__file__).parent.parent.parent.parent / "postgres_state"
backup_file = backup_dir / f"{category_name}.backup"
logger.debug(f"| Looking for backup at: {backup_file}")
if not backup_file.exists():
logger.info(f"| ○ No backup file found: {backup_file}")
return False
logger.info(f"| Restoring {category_name} from backup...")
# Set up environment for pg_restore
env = os.environ.copy()
env["PGPASSWORD"] = self.postgres_password
try:
# Restore backup
result = subprocess.run(
[
"pg_restore",
"-h", self.postgres_host,
"-p", str(self.postgres_port),
"-U", self.postgres_user,
"-d", self.postgres_database,
"-v",
str(backup_file),
],
env=env,
capture_output=True,
text=True,
timeout=120, # 2 minute timeout
)
if result.returncode != 0 and "ERROR" in result.stderr:
logger.warning(f"| pg_restore had errors for {category_name}: {result.stderr}")
return False
logger.info(f"| ✓ {category_name} restored successfully")
return True
except subprocess.TimeoutExpired:
logger.error(f"| ✗ Restore timed out for {category_name}")
return False
except Exception as e:
logger.error(f"| ✗ Failed to restore {category_name}: {e}")
return False
def get_service_config_for_agent(self) -> dict:
"""Get configuration for agent execution.
This configuration is passed to the agent/MCP server so it can
connect to the Supabase/PostgREST endpoint.
Returns:
Dictionary containing API URL and API key
"""
config = {
"api_url": self.api_url,
"api_key": self.api_key,
"schema": "public", # Default schema for PostgREST
}
# Include current task context if available
if self._current_task_context:
config["task_context"] = self._current_task_context
# If task uses a specific schema, include it
if self._current_task_context.get("schema"):
config["schema"] = self._current_task_context["schema"]
return config
def set_verification_environment(self, messages_path: str = None) -> None:
"""Set environment variables needed for verification scripts.
Args:
messages_path: Optional path to messages.json file for verification
"""
os.environ["SUPABASE_API_URL"] = self.api_url
os.environ["SUPABASE_API_KEY"] = self.api_key
# Set PostgreSQL connection details for direct database verification
os.environ["POSTGRES_HOST"] = self.postgres_host
os.environ["POSTGRES_PORT"] = str(self.postgres_port)
os.environ["POSTGRES_DATABASE"] = self.postgres_database
os.environ["POSTGRES_USERNAME"] = self.postgres_user
os.environ["POSTGRES_PASSWORD"] = self.postgres_password
if messages_path:
os.environ["MCP_MESSAGES"] = str(messages_path)
logger.debug("Verification environment variables set for Supabase (including direct postgres access)")
================================================
FILE: src/mcp_services/supabase/supabase_task_manager.py
================================================
"""
Supabase Task Manager for MCPMark
===================================
Manages Supabase task discovery, execution, and verification.
Reuses Postgres tasks but accesses them via PostgREST/Supabase MCP.
"""
import os
import subprocess
import sys
from dataclasses import dataclass
from pathlib import Path
from typing import Any, Dict, List, Optional
from src.base.task_manager import BaseTask, BaseTaskManager
from src.logger import get_logger
logger = get_logger(__name__)
@dataclass
class SupabaseTask(BaseTask):
"""Supabase-specific task with API information."""
task_name: str = ""
api_url: Optional[str] = None
api_key: Optional[str] = None
class SupabaseTaskManager(BaseTaskManager):
"""Manages Supabase tasks for MCPMark evaluation.
Uses the same task structure as Postgres tasks but accessed via
PostgREST/Supabase MCP server.
"""
def __init__(self, tasks_root: Path = None):
"""Initialize Supabase task manager.
Args:
tasks_root: Path to tasks directory
"""
if tasks_root is None:
tasks_root = Path(__file__).resolve().parents[3] / "tasks"
super().__init__(
tasks_root,
mcp_service="supabase",
task_class=SupabaseTask,
task_organization="file", # Supabase uses file-based tasks (like Postgres)
)
def _create_task_from_files(
self, category_id: str, task_files_info: Dict[str, Any]
) -> Optional[SupabaseTask]:
"""Instantiate a `SupabaseTask` from the dictionary returned by `_find_task_files`."""
import json
# Check for meta.json
meta_path = task_files_info["instruction_path"].parent / "meta.json"
final_category_id = category_id
task_id = task_files_info["task_id"]
if meta_path.exists():
try:
with open(meta_path, 'r') as f:
meta_data = json.load(f)
# Use values from meta.json if available
final_category_id = meta_data.get("category_id", category_id)
task_id = meta_data.get("task_id", task_id)
except Exception as e:
logger.warning(f"Failed to load meta.json from {meta_path}: {e}")
return SupabaseTask(
task_instruction_path=task_files_info["instruction_path"],
task_verification_path=task_files_info["verification_path"],
service="supabase",
category_id=final_category_id,
task_id=task_id,
task_name=task_files_info["task_id"],
)
def _get_verification_command(self, task: SupabaseTask) -> List[str]:
"""Get verification command with Supabase API info."""
cmd = [sys.executable, str(task.task_verification_path)]
return cmd
def run_verification(self, task: BaseTask) -> subprocess.CompletedProcess:
"""Run verification with Supabase environment."""
env = os.environ.copy()
# Pass Supabase connection info to verification script
if hasattr(task, "api_url") and task.api_url:
env["SUPABASE_API_URL"] = task.api_url
if hasattr(task, "api_key") and task.api_key:
env["SUPABASE_API_KEY"] = task.api_key
return subprocess.run(
self._get_verification_command(task),
capture_output=True,
text=True,
timeout=300,
env=env,
)
def _format_task_instruction(self, base_instruction: str) -> str:
"""Add Supabase-specific instructions."""
return (
base_instruction
+ "\n\nNote: Use Supabase MCP tools (PostgREST) to complete this task. The API connection is already configured."
)
================================================
FILE: src/model_config.py
================================================
#!/usr/bin/env python3
"""
Model Configuration for MCPMark
================================
This module provides configuration management for different LLM models,
automatically detecting the required API keys and base URLs based on the model name.
"""
import os
from typing import Dict, List
from src.logger import get_logger
# Initialize logger
logger = get_logger(__name__)
class ModelConfig:
"""
Configuration container for a specific model.
It loads the necessary API key and base URL from environment variables.
"""
# Model configuration mapping
MODEL_CONFIGS = {
# OpenAI models
"gpt-4o": {
"provider": "openai",
"api_key_var": "OPENAI_API_KEY",
"litellm_input_model_name": "openai/gpt-4o",
},
"gpt-4.1": {
"provider": "openai",
"api_key_var": "OPENAI_API_KEY",
"litellm_input_model_name": "openai/gpt-4.1",
},
"gpt-4.1-mini": {
"provider": "openai",
"api_key_var": "OPENAI_API_KEY",
"litellm_input_model_name": "openai/gpt-4.1-mini",
},
"gpt-4.1-nano": {
"provider": "openai",
"api_key_var": "OPENAI_API_KEY",
"litellm_input_model_name": "openai/gpt-4.1-nano",
},
"gpt-5.2": {
"provider": "openai",
"api_key_var": "OPENAI_API_KEY",
"litellm_input_model_name": "openai/gpt-5.2",
},
"gpt-5": {
"provider": "openai",
"api_key_var": "OPENAI_API_KEY",
"litellm_input_model_name": "openai/gpt-5",
},
"gpt-5-mini": {
"provider": "openai",
"api_key_var": "OPENAI_API_KEY",
"litellm_input_model_name": "openai/gpt-5-mini",
},
"gpt-5-nano": {
"provider": "openai",
"api_key_var": "OPENAI_API_KEY",
"litellm_input_model_name": "openai/gpt-5-nano",
},
"o3": {
"provider": "openai",
"api_key_var": "OPENAI_API_KEY",
"litellm_input_model_name": "openai/o3",
},
"o4-mini": {
"provider": "openai",
"api_key_var": "OPENAI_API_KEY",
"litellm_input_model_name": "openai/o4-mini",
},
"gpt-oss-120b": {
"provider": "openai",
"api_key_var": "OPENROUTER_API_KEY",
"litellm_input_model_name": "openrouter/openai/gpt-oss-120b",
},
# DeepSeek models
"deepseek-v3.2-instruct": {
"provider": "deepseek",
"api_key_var": "DEEPSEEK_API_KEY",
"litellm_input_model_name": "deepseek/deepseek-chat",
},
"deepseek-v3.2-thinking": {
"provider": "deepseek",
"api_key_var": "DEEPSEEK_API_KEY",
"litellm_input_model_name": "deepseek/deepseek-reasoner",
},
# Anthropic models
"claude-3.7-sonnet": {
"provider": "anthropic",
"api_key_var": "ANTHROPIC_API_KEY",
"litellm_input_model_name": "anthropic/claude-3-7-sonnet-20250219",
},
"claude-sonnet-4": {
"provider": "anthropic",
"api_key_var": "ANTHROPIC_API_KEY",
"litellm_input_model_name": "anthropic/claude-sonnet-4-20250514",
},
"claude-sonnet-4.5": {
"provider": "anthropic",
"api_key_var": "ANTHROPIC_API_KEY",
"litellm_input_model_name": "anthropic/claude-sonnet-4-5-20250929",
},
"claude-opus-4": {
"provider": "anthropic",
"api_key_var": "ANTHROPIC_API_KEY",
"litellm_input_model_name": "anthropic/claude-opus-4-20250514",
},
"claude-opus-4.1": {
"provider": "anthropic",
"api_key_var": "ANTHROPIC_API_KEY",
"litellm_input_model_name": "anthropic/claude-opus-4-1-20250805",
},
"claude-opus-4.5": {
"provider": "anthropic",
"api_key_var": "ANTHROPIC_API_KEY",
"litellm_input_model_name": "anthropic/claude-opus-4-5-20251101",
},
# Google models
"gemini-2.5-pro": {
"provider": "google",
"api_key_var": "GEMINI_API_KEY",
"litellm_input_model_name": "gemini/gemini-2.5-pro",
},
"gemini-2.5-flash": {
"provider": "google",
"api_key_var": "GEMINI_API_KEY",
"litellm_input_model_name": "gemini/gemini-2.5-flash",
},
"gemini-3-pro": {
"provider": "google",
"api_key_var": "GEMINI_API_KEY",
"litellm_input_model_name": "gemini/gemini-3-pro-preview",
},
# Moonshot models
"kimi-k2-0711": {
"provider": "moonshot",
"api_key_var": "MOONSHOT_API_KEY",
"litellm_input_model_name": "moonshot/kimi-k2-0711-preview",
},
"kimi-k2-0905": {
"provider": "moonshot",
"api_key_var": "MOONSHOT_API_KEY",
"litellm_input_model_name": "moonshot/kimi-k2-0905-preview",
},
"kimi-k2-thinking": {
"provider": "moonshot",
"api_key_var": "OPENROUTER_API_KEY",
"litellm_input_model_name": "openrouter/moonshotai/kimi-k2-thinking",
},
# Grok models
"grok-4": {
"provider": "xai",
"api_key_var": "GROK_API_KEY",
"litellm_input_model_name": "xai/grok-4-0709",
},
"grok-code-fast-1": {
"provider": "xai",
"api_key_var": "GROK_API_KEY",
"litellm_input_model_name": "xai/grok-code-fast-1",
},
# Qwen models
"qwen-3-coder-plus": {
"provider": "qwen",
"api_key_var": "DASHSCOPE_API_KEY",
"litellm_input_model_name": "dashscope/qwen3-coder-plus",
},
"qwen-3-max": {
"provider": "qwen",
"api_key_var": "DASHSCOPE_API_KEY",
"litellm_input_model_name": "dashscope/qwen3-max-preview",
},
# Zhipu
"glm-4.5": {
"provider": "zhipu",
"api_key_var": "OPENROUTER_API_KEY",
"litellm_input_model_name": "openrouter/z-ai/glm-4.5",
}
}
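# A hypothetical sketch of how an additional MODEL_CONFIGS entry would look
# (the names below are illustrative only, not a supported model):
#
#     "my-model": {
#         "provider": "openai",
#         "api_key_var": "OPENAI_API_KEY",
#         "litellm_input_model_name": "openai/my-model",
#     },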
def __init__(self, model_name: str):
"""
Initializes the model configuration.
Args:
model_name: The name of the model (e.g., 'gpt-4o', 'deepseek-v3.2-instruct').
Raises:
ValueError: If the required API key environment variable is missing.
"""
self.short_model_name = model_name
model_info = self._get_model_info(model_name)
# Load API key, base URL and LiteLLM model name from environment variables
if "base_url_var" in model_info:
self.base_url = os.getenv(model_info["base_url_var"])
else:
self.base_url = None
self.api_key = os.getenv(model_info["api_key_var"])
if not self.api_key:
raise ValueError(
f"Missing required environment variable: {model_info['api_key_var']}"
)
self.litellm_input_model_name = model_info.get("litellm_input_model_name", model_name)
def _get_model_info(self, model_name: str) -> Dict[str, str]:
"""
Retrieves the configuration details for a given model name.
For unsupported models, falls back to the OpenAI provider and OPENAI_API_KEY.
"""
if model_name not in self.MODEL_CONFIGS:
logger.warning(
f"Model '{model_name}' not in supported list. Using default OpenAI configuration."
)
# Return default configuration for unsupported models
return {
"provider": "openai",
"api_key_var": "OPENAI_API_KEY",
"litellm_input_model_name": model_name,
}
return self.MODEL_CONFIGS[model_name]
@classmethod
def get_supported_models(cls) -> List[str]:
"""Returns a list of all supported model names."""
return list(cls.MODEL_CONFIGS.keys())
def main():
"""Example usage of the ModelConfig class."""
logger.info("Supported models: %s", ModelConfig.get_supported_models())
try:
# Example: Create a model config for a supported DeepSeek model
model_config = ModelConfig("deepseek-v3.2-instruct")
logger.info("✅ DeepSeek model config created successfully.")
logger.info("Short model name: %s", model_config.short_model_name)
logger.info("API key loaded: %s", bool(model_config.api_key))
except ValueError as e:
logger.error("⚠️ Configuration error: %s", e)
if __name__ == "__main__":
main()
================================================
FILE: src/results_reporter.py
================================================
#!/usr/bin/env python3
"""
Results Reporter for MCPMark Evaluation Pipeline
================================================
This module provides utilities for saving evaluation results in a structured format.
"""
import json
from dataclasses import dataclass
from datetime import datetime
from pathlib import Path
from typing import Any, Dict, List, Optional
from src.logger import get_logger
# Initialize logger
logger = get_logger(__name__)
@dataclass
class TaskResult:
"""
Represents the result of a single task evaluation.
Attributes:
task_name: The full name of the task (e.g., "category_id__task_id").
success: Whether the task completed successfully.
category_id: The task category ID.
task_id: The task identifier (number or slug).
error_message: Error message from agent execution if it failed.
verification_error: Error message from verification if it failed.
verification_output: Captured stdout from verification script.
model_output: Agent conversation trajectory (messages).
token_usage: Token usage statistics.
turn_count: Number of turns taken during task execution.
agent_execution_time: Time for Step 2 (agent execution) in seconds.
task_execution_time: Total time for Steps 1-4 in seconds.
"""
task_name: str
success: bool
category_id: Optional[str] = None
task_id: Optional[str] = None
error_message: Optional[str] = None # Agent execution error
verification_error: Optional[str] = None # Verification error (separate from agent error)
verification_output: Optional[str] = None # Verification stdout/stderr
model_output: Optional[Any] = None # Agent conversation trajectory
token_usage: Optional[Dict[str, int]] = None # Token usage statistics
turn_count: Optional[int] = None # Number of turns taken during task execution
agent_execution_time: float = 0.0 # Time for Step 2 (agent execution) in seconds
task_execution_time: float = 0.0 # Total time for Steps 1-4 in seconds
@property
def status(self) -> str:
"""Returns the status of the task as 'PASS' or 'FAIL'."""
return "PASS" if self.success else "FAIL"
@dataclass
class EvaluationReport:
"""Represents a complete evaluation report for a model."""
model_name: str
model_config: Dict[str, Any]
total_tasks: int
successful_tasks: int
failed_tasks: int
task_results: List[TaskResult]
tasks_filter: Optional[str] = None
@property
def success_rate(self) -> float:
"""Calculates the overall success rate as a percentage."""
if self.total_tasks == 0:
return 0.0
return (self.successful_tasks / self.total_tasks) * 100
@property
def total_input_tokens(self) -> int:
"""Calculate total input tokens across all tasks."""
total = 0
for result in self.task_results:
if result.token_usage:
total += (result.token_usage.get("input_tokens") or 0)
return total
@property
def total_output_tokens(self) -> int:
"""Calculate total output tokens across all tasks."""
total = 0
for result in self.task_results:
if result.token_usage:
total += (result.token_usage.get("output_tokens") or 0)
return total
@property
def total_tokens(self) -> int:
"""Calculate total tokens across all tasks."""
total = 0
for result in self.task_results:
if result.token_usage:
total += (result.token_usage.get("total_tokens") or 0)
return total
@property
def total_reasoning_tokens(self) -> int:
"""Calculate total reasoning tokens across all tasks."""
total = 0
for result in self.task_results:
if result.token_usage:
total += (result.token_usage.get("reasoning_tokens") or 0)
return total
@property
def avg_input_tokens(self) -> float:
"""Calculate average input tokens per task."""
if self.total_tasks == 0:
return 0.0
return self.total_input_tokens / self.total_tasks
@property
def avg_output_tokens(self) -> float:
"""Calculate average output tokens per task."""
if self.total_tasks == 0:
return 0.0
return self.total_output_tokens / self.total_tasks
@property
def avg_total_tokens(self) -> float:
"""Calculate average total tokens per task."""
if self.total_tasks == 0:
return 0.0
return self.total_tokens / self.total_tasks
@property
def avg_reasoning_tokens(self) -> float:
"""Calculate average reasoning tokens per task."""
if self.total_tasks == 0:
return 0.0
return self.total_reasoning_tokens / self.total_tasks
@property
def total_task_execution_time(self) -> float:
"""Calculates the total task execution time from sum of all task execution times."""
# Use sum of individual task execution times instead of pipeline wall clock time
# This ensures resume functionality shows correct total time
return sum(task.task_execution_time for task in self.task_results)
@property
def total_agent_execution_time(self) -> float:
"""Calculates the total agent execution time (Step 2) across all tasks."""
return sum(task.agent_execution_time for task in self.task_results)
def get_category_stats(self) -> Dict[str, Dict[str, Any]]:
"""
Calculates and returns success statistics grouped by task category.
"""
category_stats = {}
for result in self.task_results:
category = result.category_id or "Uncategorized"
if category not in category_stats:
category_stats[category] = {
"total": 0,
"successful": 0,
"failed": 0,
"success_rate": 0.0,
"avg_execution_time": 0.0,
"avg_agent_execution_time": 0.0,
"total_input_tokens": 0,
"total_output_tokens": 0,
"total_tokens": 0,
"total_reasoning_tokens": 0,
"avg_input_tokens": 0.0,
"avg_output_tokens": 0.0,
"avg_total_tokens": 0.0,
"avg_reasoning_tokens": 0.0,
"total_turns": 0,
"avg_turns": 0.0,
}
category_stats[category]["total"] += 1
if result.success:
category_stats[category]["successful"] += 1
else:
category_stats[category]["failed"] += 1
# Add token and turn usage
if result.token_usage:
category_stats[category]["total_input_tokens"] += (
result.token_usage.get("input_tokens") or 0
)
category_stats[category]["total_output_tokens"] += (
result.token_usage.get("output_tokens") or 0
)
category_stats[category]["total_tokens"] += (
result.token_usage.get("total_tokens") or 0
)
category_stats[category]["total_reasoning_tokens"] += result.token_usage.get(
"reasoning_tokens", 0
) or 0
# Accumulate turns
if result.turn_count is not None:
category_stats[category]["total_turns"] += result.turn_count
# Calculate derived metrics like success rate and average time
for category, stats in category_stats.items():
if stats["total"] > 0:
stats["success_rate"] = (stats["successful"] / stats["total"]) * 100
category_results = [
r
for r in self.task_results
if (r.category_id or "Uncategorized") == category
]
total_time = sum(r.task_execution_time for r in category_results)
stats["avg_execution_time"] = total_time / len(category_results)
# Add agent execution time stats
total_agent_time = sum(r.agent_execution_time for r in category_results)
stats["avg_agent_execution_time"] = total_agent_time / len(category_results)
# Calculate average tokens and turns
stats["avg_input_tokens"] = stats["total_input_tokens"] / stats["total"]
stats["avg_output_tokens"] = (
stats["total_output_tokens"] / stats["total"]
)
stats["avg_total_tokens"] = stats["total_tokens"] / stats["total"]
stats["avg_reasoning_tokens"] = stats["total_reasoning_tokens"] / stats["total"]
stats["avg_turns"] = (
stats["total_turns"] / stats["total"] if stats["total"] > 0 else 0
)
return category_stats
class ResultsReporter:
"""Handles saving evaluation results in structured formats."""
def __init__(self):
"""Initialize the results reporter."""
pass
def save_messages_json(self, messages: Any, output_path: Path) -> Path:
"""Saves the conversation messages/trajectory as messages.json."""
output_path.parent.mkdir(parents=True, exist_ok=True)
with output_path.open("w", encoding="utf-8") as f:
json.dump(messages, f, indent=2, ensure_ascii=False)
return output_path
def save_meta_json(
self,
task_result: TaskResult,
model_config: Dict[str, Any],
start_time: datetime,
end_time: datetime,
output_path: Path,
) -> Path:
"""Saves task metadata (excluding messages) as meta.json."""
output_path.parent.mkdir(parents=True, exist_ok=True)
meta_data = {
"task_name": task_result.task_name,
"model_name": model_config.get("model_name", "unknown"),
"litellm_run_model_name": model_config.get("litellm_run_model_name"),
"reasoning_effort": model_config.get("reasoning_effort"),
"mcp": model_config.get("mcp_service", "unknown"),
"timeout": model_config.get("timeout", 300),
"time": {"start": start_time.isoformat(), "end": end_time.isoformat()},
"agent_execution_time": task_result.agent_execution_time,
"task_execution_time": task_result.task_execution_time,
"execution_result": {
"success": task_result.success,
"error_message": task_result.error_message,
"verification_error": task_result.verification_error,
"verification_output": task_result.verification_output,
},
"token_usage": task_result.token_usage or {},
"turn_count": task_result.turn_count,
}
with output_path.open("w", encoding="utf-8") as f:
json.dump(meta_data, f, indent=2, ensure_ascii=False)
return output_path
def save_model_summary(self, report: EvaluationReport, output_path: Path) -> Path:
"""Saves a concise model-level summary."""
output_path.parent.mkdir(parents=True, exist_ok=True)
category_stats = report.get_category_stats()
# Aggregate turn counts using category_stats
total_turns = sum(stats["total_turns"] for stats in category_stats.values())
avg_turns = total_turns / report.total_tasks if report.total_tasks > 0 else 0
summary = {
"model_name": report.model_name,
"model_config": report.model_config,
"total_tasks": report.total_tasks,
"successful_tasks": report.successful_tasks,
"failed_tasks": report.failed_tasks,
"success_rate": round(report.success_rate, 2),
"total_task_execution_time": report.total_task_execution_time,
"average_task_execution_time": report.total_task_execution_time / report.total_tasks
if report.total_tasks > 0
else 0,
"total_agent_execution_time": report.total_agent_execution_time,
"average_agent_execution_time": report.total_agent_execution_time / report.total_tasks
if report.total_tasks > 0
else 0,
"token_usage": {
"total_input_tokens": report.total_input_tokens,
"total_output_tokens": report.total_output_tokens,
"total_tokens": report.total_tokens,
"total_reasoning_tokens": report.total_reasoning_tokens,
"avg_input_tokens": round(report.avg_input_tokens, 2),
"avg_output_tokens": round(report.avg_output_tokens, 2),
"avg_total_tokens": round(report.avg_total_tokens, 2),
"avg_reasoning_tokens": round(report.avg_reasoning_tokens, 2),
},
"turn_usage": {
"total_turns": total_turns,
"avg_turns": round(avg_turns, 2),
},
"category_breakdown": {
category: {
"total": stats["total"],
"success_rate": round(stats["success_rate"], 2),
"avg_time": round(stats["avg_execution_time"], 2),
"token_usage": {
"total_input": stats["total_input_tokens"],
"total_output": stats["total_output_tokens"],
"total": stats["total_tokens"],
"total_reasoning": stats["total_reasoning_tokens"],
"avg_input": round(stats["avg_input_tokens"], 2),
"avg_output": round(stats["avg_output_tokens"], 2),
"avg_total": round(stats["avg_total_tokens"], 2),
"avg_reasoning": round(stats["avg_reasoning_tokens"], 2),
},
"turn_usage": {
"total_turns": stats["total_turns"],
"avg_turns": round(stats["avg_turns"], 2),
},
}
for category, stats in category_stats.items()
},
}
with output_path.open("w", encoding="utf-8") as f:
json.dump(summary, f, indent=2, ensure_ascii=False)
return output_path
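# A minimal usage sketch (the `result`, `model_config`, timestamps, and paths
# below are illustrative assumptions, not part of this module's API):
#
#     reporter = ResultsReporter()
#     reporter.save_messages_json(result.model_output, run_dir / "messages.json")
#     reporter.save_meta_json(result, model_config, start, end, run_dir / "meta.json")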
================================================
FILE: src/services.py
================================================
"""
Service Definitions for MCPMark
================================
Single source of truth for all MCP service configurations.
Adding a new service only requires modifying this file.
Note: Environment variables are already loaded from .mcp_env when the app starts,
so we can reference them directly via the config system.
MCP server creation is now handled entirely within src.agent.MCPAgent; therefore,
the legacy "mcp_server" and "eval_config" entries in each service definition are
deprecated and set to None for backward-compatibility.
"""
# Service definitions
SERVICES = {
"notion": {
"config_schema": {
"source_api_key": {
"env_var": "SOURCE_NOTION_API_KEY",
"required": True,
"description": "Notion API key for source hub with templates",
},
"eval_api_key": {
"env_var": "EVAL_NOTION_API_KEY",
"required": True,
"description": "Notion API key for evaluation hub",
},
"source_parent_page_title": {
"env_var": "SOURCE_PARENT_PAGE_TITLE",
"default": "MCPMark Source Hub",
"required": False,
"description": "Title of the source hub page that contains all initial states",
},
"eval_parent_page_title": {
"env_var": "EVAL_PARENT_PAGE_TITLE",
"required": True,
"description": "Title of the parent page in evaluation workspace",
},
"playwright_headless": {
"env_var": "PLAYWRIGHT_HEADLESS",
"default": True,
"required": False,
"description": "Run browser in headless mode",
"transform": "bool", # Will be handled by GenericConfigSchema
},
"playwright_browser": {
"env_var": "PLAYWRIGHT_BROWSER",
"default": "firefox",
"required": False,
"description": "Browser to use for Playwright",
"validator": "in:chromium,firefox,webkit", # Simple validator syntax
},
},
"components": {
"task_manager": "src.mcp_services.notion.notion_task_manager.NotionTaskManager",
"state_manager": "src.mcp_services.notion.notion_state_manager.NotionStateManager",
"login_helper": "src.mcp_services.notion.notion_login_helper.NotionLoginHelper",
},
"config_mapping": {
# Maps config schema keys to class constructor parameters
"state_manager": {
"source_notion_key": "source_api_key",
"eval_notion_key": "eval_api_key",
"headless": "playwright_headless",
"browser": "playwright_browser",
"source_parent_page_title": "source_parent_page_title",
"eval_parent_page_title": "eval_parent_page_title",
},
"login_helper": {
"headless": "playwright_headless",
"browser": "playwright_browser",
},
},
# MCP server is now instantiated dynamically in MCPAgent; kept for backward
# compatibility but set to None to indicate deprecation.
"mcp_server": None,
"eval_config": None,
},
"github": {
"config_schema": {
"github_tokens": {
"env_var": "GITHUB_TOKENS",
"required": True,
"description": "GitHub personal access token(s) - comma-separated for round-robin",
"transform": "list", # Will split by comma
},
# Evaluation organisation / user that hosts ephemeral test repositories
"eval_org": {
"env_var": "GITHUB_EVAL_ORG",
"default": "mcpleague-eval",
"required": False,
"description": "Evaluation organisation or user for creating temporary test repositories",
},
# (source_org removed – template repos now imported from local files)
},
"components": {
"task_manager": "src.mcp_services.github.github_task_manager.GitHubTaskManager",
"state_manager": "src.mcp_services.github.github_state_manager.GitHubStateManager",
"login_helper": "src.mcp_services.github.github_login_helper.GitHubLoginHelper",
},
"config_mapping": {
"state_manager": {
"github_token": "github_tokens",
"eval_org": "eval_org",
},
"login_helper": {
# Login helper needs a single token, we'll use the first one
"token": "github_tokens",
},
},
"mcp_server": None,
"eval_config": None,
},
"filesystem": {
"config_schema": {
"test_root": {
"env_var": "FILESYSTEM_TEST_ROOT",
"default": None,
"required": False,
"description": "Root directory for filesystem tests",
"transform": "path", # Convert to Path object
},
"cleanup_on_exit": {
"env_var": "FILESYSTEM_CLEANUP",
"default": True,
"required": False,
"description": "Clean up test directories after tasks",
"transform": "bool",
},
},
"components": {
"task_manager": "src.mcp_services.filesystem.filesystem_task_manager.FilesystemTaskManager",
"state_manager": "src.mcp_services.filesystem.filesystem_state_manager.FilesystemStateManager",
"login_helper": "src.mcp_services.filesystem.filesystem_login_helper.FilesystemLoginHelper",
},
"config_mapping": {
"state_manager": {
"test_root": "test_root",
"cleanup_on_exit": "cleanup_on_exit",
}
},
"mcp_server": None,
"eval_config": None,
},
"playwright": {
"config_schema": {
"browser": {
"env_var": "PLAYWRIGHT_BROWSER",
"default": "chromium",
"required": False,
"description": "Browser to use (chromium, firefox, webkit)",
"validator": "in:chromium,firefox,webkit",
},
"headless": {
"env_var": "PLAYWRIGHT_HEADLESS",
"default": True,
"required": False,
"description": "Run browser in headless mode",
"transform": "bool",
},
"network_origins": {
"env_var": "PLAYWRIGHT_NETWORK_ORIGINS",
"default": "*",
"required": False,
"description": "Allowed network origins (comma-separated or *)",
},
"user_profile": {
"env_var": "PLAYWRIGHT_USER_PROFILE",
"default": "isolated",
"required": False,
"description": "User profile type (isolated or persistent)",
"validator": "in:isolated,persistent",
},
"viewport_width": {
"env_var": "PLAYWRIGHT_VIEWPORT_WIDTH",
"default": 1280,
"required": False,
"description": "Browser viewport width",
"transform": "int",
},
"viewport_height": {
"env_var": "PLAYWRIGHT_VIEWPORT_HEIGHT",
"default": 720,
"required": False,
"description": "Browser viewport height",
"transform": "int",
},
},
"components": {
"task_manager": "src.mcp_services.playwright.playwright_task_manager.PlaywrightTaskManager",
"state_manager": "src.mcp_services.playwright.playwright_state_manager.PlaywrightStateManager",
"login_helper": "src.mcp_services.playwright.playwright_login_helper.PlaywrightLoginHelper",
},
"config_mapping": {
"state_manager": {
"browser": "browser",
"headless": "headless",
"network_origins": "network_origins",
"user_profile": "user_profile",
"viewport_width": "viewport_width",
"viewport_height": "viewport_height",
},
"login_helper": {
"browser": "browser",
"headless": "headless",
},
},
"mcp_server": None,
"eval_config": None,
},
"postgres": {
"config_schema": {
"host": {
"env_var": "POSTGRES_HOST",
"default": "localhost",
"required": False,
"description": "PostgreSQL server host",
},
"port": {
"env_var": "POSTGRES_PORT",
"default": 5432,
"required": False,
"description": "PostgreSQL server port",
"transform": "int",
"validator": "port", # Validates port range 1-65535
},
"database": {
"env_var": "POSTGRES_DATABASE",
"default": "postgres",
"required": False,
"description": "PostgreSQL database name",
},
"username": {
"env_var": "POSTGRES_USERNAME",
"default": "postgres",
"required": False,
"description": "PostgreSQL username",
},
"password": {
"env_var": "POSTGRES_PASSWORD",
"default": "password",
"required": False,
"description": "PostgreSQL password",
},
},
"components": {
"task_manager": "src.mcp_services.postgres.postgres_task_manager.PostgresTaskManager",
"state_manager": "src.mcp_services.postgres.postgres_state_manager.PostgresStateManager",
"login_helper": "src.mcp_services.postgres.postgres_login_helper.PostgresLoginHelper",
},
"config_mapping": {
"state_manager": {
"host": "host",
"port": "port",
"database": "database",
"username": "username",
"password": "password",
},
"login_helper": {
"host": "host",
"port": "port",
"database": "database",
"username": "username",
"password": "password",
},
},
"mcp_server": None,
"eval_config": None,
},
"insforge": {
"config_schema": {
"api_key": {
"env_var": "INSFORGE_API_KEY",
"required": True,
"description": "Insforge backend API key for authentication",
},
"backend_url": {
"env_var": "INSFORGE_BACKEND_URL",
"required": True,
"description": "Insforge backend URL (e.g., https://your-app.insforge.app)",
},
},
"components": {
"task_manager": "src.mcp_services.insforge.insforge_task_manager.InsforgeTaskManager",
"state_manager": "src.mcp_services.insforge.insforge_state_manager.InsforgeStateManager",
"login_helper": "src.mcp_services.insforge.insforge_login_helper.InsforgeLoginHelper",
},
"config_mapping": {
"state_manager": {
"api_key": "api_key",
"backend_url": "backend_url",
},
"login_helper": {
"api_key": "api_key",
"backend_url": "backend_url",
},
},
"mcp_server": None,
"eval_config": None,
},
"supabase": {
"config_schema": {
"api_url": {
"env_var": "SUPABASE_API_URL",
"required": False,
"description": "Supabase PostgREST API URL (default: http://localhost:54321 from CLI)",
"default": "http://localhost:54321",
},
"api_key": {
"env_var": "SUPABASE_API_KEY",
"required": False,
"description": "Supabase API key (anon or service_role key from 'supabase status')",
},
"postgres_host": {
"env_var": "SUPABASE_DB_HOST",
"required": False,
"description": "PostgreSQL host for Supabase CLI instance",
"default": "localhost",
},
"postgres_port": {
"env_var": "SUPABASE_DB_PORT",
"required": False,
"description": "PostgreSQL port for Supabase CLI instance (default: 54322)",
"default": 54322,
},
"postgres_user": {
"env_var": "SUPABASE_DB_USER",
"required": False,
"description": "PostgreSQL username",
"default": "postgres",
},
"postgres_password": {
"env_var": "SUPABASE_DB_PASSWORD",
"required": False,
"description": "PostgreSQL password",
"default": "postgres",
},
"postgres_database": {
"env_var": "SUPABASE_DB_NAME",
"required": False,
"description": "PostgreSQL database name",
"default": "postgres",
},
},
"components": {
"task_manager": "src.mcp_services.supabase.supabase_task_manager.SupabaseTaskManager",
"state_manager": "src.mcp_services.supabase.supabase_state_manager.SupabaseStateManager",
"login_helper": "src.mcp_services.supabase.supabase_login_helper.SupabaseLoginHelper",
},
"config_mapping": {
"state_manager": {
"api_url": "api_url",
"api_key": "api_key",
"postgres_host": "postgres_host",
"postgres_port": "postgres_port",
"postgres_user": "postgres_user",
"postgres_password": "postgres_password",
"postgres_database": "postgres_database",
},
"login_helper": {},
},
"mcp_server": None,
"eval_config": None,
},
"playwright_webarena": {
"config_schema": {
"browser": {
"env_var": "PLAYWRIGHT_BROWSER",
"default": "chromium",
"required": False,
"description": "Browser to use (chromium, firefox, webkit)",
"validator": "in:chromium,firefox,webkit",
},
"headless": {
"env_var": "PLAYWRIGHT_HEADLESS",
"default": True,
"required": False,
"description": "Run browser in headless mode",
"transform": "bool",
},
"network_origins": {
"env_var": "PLAYWRIGHT_NETWORK_ORIGINS",
"default": "*",
"required": False,
"description": "Allowed network origins (comma-separated or *)",
},
"user_profile": {
"env_var": "PLAYWRIGHT_USER_PROFILE",
"default": "isolated",
"required": False,
"description": "User profile type (isolated or persistent)",
"validator": "in:isolated,persistent",
},
"viewport_width": {
"env_var": "PLAYWRIGHT_VIEWPORT_WIDTH",
"default": 1280,
"required": False,
"description": "Browser viewport width",
"transform": "int",
},
"viewport_height": {
"env_var": "PLAYWRIGHT_VIEWPORT_HEIGHT",
"default": 720,
"required": False,
"description": "Browser viewport height",
"transform": "int",
},
"skip_cleanup": {
"env_var": "PLAYWRIGHT_WEBARENA_SKIP_CLEANUP",
"default": False,
"required": False,
"description": "Skip Docker container cleanup for debugging",
"transform": "bool",
},
},
"components": {
"task_manager": "src.mcp_services.playwright_webarena.playwright_task_manager.PlaywrightTaskManager",
"state_manager": "src.mcp_services.playwright_webarena.playwright_state_manager.PlaywrightStateManager",
"login_helper": "src.mcp_services.playwright_webarena.playwright_login_helper.PlaywrightLoginHelper",
},
"config_mapping": {
"state_manager": {
"browser": "browser",
"headless": "headless",
"network_origins": "network_origins",
"user_profile": "user_profile",
"viewport_width": "viewport_width",
"viewport_height": "viewport_height",
"skip_cleanup": "skip_cleanup",
},
"login_helper": {
"browser": "browser",
"headless": "headless",
},
"task_manager": {},
},
"mcp_server": None,
"eval_config": None,
},
}
def get_service_definition(service_name: str) -> dict:
"""Get MCP service definition by name."""
if service_name not in SERVICES:
raise ValueError(f"Unknown MCP service: {service_name}")
return SERVICES[service_name]
def get_supported_mcp_services() -> list:
"""Get list of implemented MCP services."""
return [
name
for name, config in SERVICES.items()
if config["components"]["task_manager"] is not None
]
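# Illustrative usage (a minimal sketch, not part of the evaluation pipeline): resolve
# one service definition by name and list the services that have a task manager wired up.
if __name__ == "__main__":
    supabase_def = get_service_definition("supabase")
    print("Supabase state manager:", supabase_def["components"]["state_manager"])
    print("Implemented MCP services:", get_supported_mcp_services())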
================================================
FILE: tasks/__init__.py
================================================
================================================
FILE: tasks/filesystem/easy/.gitkeep
================================================
================================================
FILE: tasks/filesystem/easy/file_context/file_splitting/description.md
================================================
# File Splitting Task
## 📋 Task Description
You need to split a large text file into multiple smaller files with nearly equal character counts. The task involves creating a new directory and splitting the content into exactly 3 files.
## 🎯 Task Objectives
1. **Create a new directory** named `split` in the test directory
2. **Split the file** `large_file.txt` into exactly 3 files with **similar** character counts (maximum character difference of 100 between any two files)
3. **Name the files** as `split_01.txt`, `split_02.txt`, `split_03.txt` in the `split` directory
================================================
FILE: tasks/filesystem/easy/file_context/file_splitting/meta.json
================================================
{
"task_id": "file_splitting",
"task_name": "File Splitting",
"category_id": "file_context",
"category_name": "File Context",
"description": "Split large_file.txt into three nearly equal chunks stored as split_01.txt-split_03.txt inside a new split directory.",
"author": "Lingjun Chen",
"created_at": "2025-11-15",
"difficulty": "L1",
"tags": [
"content transformation",
"file automation"
],
"mcp": [
"filesystem"
],
"meta_data": {
"stateType": "text",
"stateContent": "file_context/\n \u251c\u2500\u2500 file_01.txt\n \u251c\u2500\u2500 file_02.txt\n \u251c\u2500\u2500 file_03.txt\n \u251c\u2500\u2500 file_04.txt\n \u251c\u2500\u2500 file_05.txt\n \u251c\u2500\u2500 file_06.txt\n \u251c\u2500\u2500 file_07.txt\n \u251c\u2500\u2500 file_08.txt\n \u251c\u2500\u2500 file_09.txt\n \u251c\u2500\u2500 file_10.txt\n \u251c\u2500\u2500 file_11.txt\n \u251c\u2500\u2500 file_12.txt\n \u251c\u2500\u2500 file_13.txt\n \u251c\u2500\u2500 file_14.txt\n \u251c\u2500\u2500 file_15.txt\n \u251c\u2500\u2500 file_16.txt\n \u251c\u2500\u2500 file_17.txt\n \u251c\u2500\u2500 file_18.txt\n \u251c\u2500\u2500 file_19.txt\n \u251c\u2500\u2500 file_20.txt\n \u2514\u2500\u2500 large_file.txt",
"stateUrl": "https://storage.mcpmark.ai/filesystem/file_context.zip",
"stateOriginalUrl": null
}
}
================================================
FILE: tasks/filesystem/easy/file_context/file_splitting/verify.py
================================================
#!/usr/bin/env python3
"""
Verification script for File Splitting Task
"""
import sys
from pathlib import Path
import os
def get_test_directory() -> Path:
"""Get the test directory from FILESYSTEM_TEST_DIR env var."""
test_root = os.environ.get("FILESYSTEM_TEST_DIR")
if not test_root:
raise ValueError("FILESYSTEM_TEST_DIR environment variable is required")
return Path(test_root)
def verify_split_directory_exists(test_dir: Path) -> bool:
"""Verify that the split directory exists."""
split_dir = test_dir / "split"
if not split_dir.exists():
print("❌ Directory 'split' not found")
return False
if not split_dir.is_dir():
print("❌ 'split' exists but is not a directory")
return False
print("✅ Split directory found")
return True
def verify_all_split_files_exist(test_dir: Path) -> bool:
"""Verify that all 3 split files exist with correct names."""
split_dir = test_dir / "split"
expected_files = [f"split_{i:02d}.txt" for i in range(1, 4)]
missing_files = []
for filename in expected_files:
file_path = split_dir / filename
if not file_path.exists():
missing_files.append(filename)
if missing_files:
print(f"❌ Missing files: {missing_files}")
return False
print("✅ All 3 split files exist with correct names")
return True
def verify_similar_file_lengths(test_dir: Path) -> bool:
"""Verify that all split files have similar character counts (within 30 characters difference)."""
split_dir = test_dir / "split"
file_lengths = []
for i in range(1, 4):
filename = f"split_{i:02d}.txt"
file_path = split_dir / filename
try:
content = file_path.read_text()
file_lengths.append(len(content))
except Exception as e:
print(f"❌ Error reading {filename}: {e}")
return False
# Check that all lengths are within 100 characters of each other
min_length = min(file_lengths)
max_length = max(file_lengths)
length_difference = max_length - min_length
if length_difference > 100:
print(f"❌ File lengths differ by more than 30 characters: {length_difference}")
print(f" Min length: {min_length}, Max length: {max_length}")
print(f" All lengths: {file_lengths}")
return False
print(f"✅ All files have similar lengths (difference: {length_difference} characters)")
print(f" Min: {min_length}, Max: {max_length}")
return True
def verify_content_integrity(test_dir: Path) -> bool:
"""Verify that concatenated split files equal the original file."""
split_dir = test_dir / "split"
original_file = test_dir / "large_file.txt"
# Read original content
try:
original_content = original_file.read_text()
except Exception as e:
print(f"❌ Error reading original file: {e}")
return False
# Concatenate all split files
concatenated_content = ""
for i in range(1, 4):
filename = f"split_{i:02d}.txt"
file_path = split_dir / filename
try:
content = file_path.read_text()
concatenated_content += content
except Exception as e:
print(f"❌ Error reading {filename}: {e}")
return False
# Compare content
if concatenated_content != original_content:
print("❌ Concatenated content does not match original file")
print(f" Original length: {len(original_content)}")
print(f" Concatenated length: {len(concatenated_content)}")
return False
print("✅ Concatenated content matches original file exactly")
return True
def verify_no_extra_files(test_dir: Path) -> bool:
"""Verify that no extra files exist in the split directory."""
split_dir = test_dir / "split"
expected_files = {f"split_{i:02d}.txt" for i in range(1, 4)}
actual_files = {f.name for f in split_dir.iterdir() if f.is_file()}
extra_files = actual_files - expected_files
if extra_files:
print(f"❌ Extra files found in split directory: {extra_files}")
return False
print("✅ No extra files in split directory")
return True
def main():
"""Main verification function."""
test_dir = get_test_directory()
print("🔍 Verifying File Splitting Task...")
# Define verification steps
verification_steps = [
("Split Directory Exists", verify_split_directory_exists),
("All Split Files Exist", verify_all_split_files_exist),
("Similar File Lengths", verify_similar_file_lengths),
("Content Integrity", verify_content_integrity),
("No Extra Files", verify_no_extra_files),
]
# Run all verification steps
all_passed = True
for step_name, verify_func in verification_steps:
print(f"\n--- {step_name} ---")
if not verify_func(test_dir):
all_passed = False
# Final result
print("\n" + "="*50)
if all_passed:
print("✅ File splitting task completed correctly!")
print("🎉 Task verification: PASS")
sys.exit(0)
else:
print("❌ Task verification: FAIL")
sys.exit(1)
if __name__ == "__main__":
main()
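# Invocation sketch (assumed; the MCPMark harness normally sets FILESYSTEM_TEST_DIR
# itself before running the verifier):
#   FILESYSTEM_TEST_DIR=/path/to/prepared/file_context python verify.py
# The script exits with status 0 on PASS and 1 on FAIL, which is how the result is
# reported back to the caller.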
================================================
FILE: tasks/filesystem/easy/file_context/pattern_matching/description.md
================================================
# File Filtering Task: Find Files with Common Substring
## 📋 Task Description
Your task is to find all files that contain a substring of 30 or more characters that also appears in `large_file.txt`. **You are not allowed to use Python code.**
## 🎯 Task Objectives
1. **Read the reference file** `large_file.txt` to understand its content
2. **Examine each file** from file_01.txt to file_20.txt
3. **Find files** that contain a substring of 30 or more characters that matches a substring in `large_file.txt`
4. **Create a file `answer.txt`** and write the results to it with the following format:
- One line per matching file
- Format: `filename.txt`
- Do not add anything else besides the filename itself
================================================
FILE: tasks/filesystem/easy/file_context/pattern_matching/meta.json
================================================
{
"task_id": "pattern_matching",
"task_name": "Pattern Matching",
"category_id": "file_context",
"category_name": "File Context",
"description": "Scan file_01.txt through file_20.txt for any 30+ character substring that also appears in large_file.txt and list each matching filename in answer.txt.",
"author": "Lingjun Chen",
"created_at": "2025-11-15",
"difficulty": "L1",
"tags": [
"pattern analysis",
"search and filtering"
],
"mcp": [
"filesystem"
],
"meta_data": {
"stateType": "text",
"stateContent": "file_context/\n \u251c\u2500\u2500 file_01.txt\n \u251c\u2500\u2500 file_02.txt\n \u251c\u2500\u2500 file_03.txt\n \u251c\u2500\u2500 file_04.txt\n \u251c\u2500\u2500 file_05.txt\n \u251c\u2500\u2500 file_06.txt\n \u251c\u2500\u2500 file_07.txt\n \u251c\u2500\u2500 file_08.txt\n \u251c\u2500\u2500 file_09.txt\n \u251c\u2500\u2500 file_10.txt\n \u251c\u2500\u2500 file_11.txt\n \u251c\u2500\u2500 file_12.txt\n \u251c\u2500\u2500 file_13.txt\n \u251c\u2500\u2500 file_14.txt\n \u251c\u2500\u2500 file_15.txt\n \u251c\u2500\u2500 file_16.txt\n \u251c\u2500\u2500 file_17.txt\n \u251c\u2500\u2500 file_18.txt\n \u251c\u2500\u2500 file_19.txt\n \u251c\u2500\u2500 file_20.txt\n \u2514\u2500\u2500 large_file.txt",
"stateUrl": "https://storage.mcpmark.ai/filesystem/file_context.zip",
"stateOriginalUrl": null
}
}
================================================
FILE: tasks/filesystem/easy/file_context/pattern_matching/verify.py
================================================
#!/usr/bin/env python3
"""
Verification script for File Filtering Task: Find Files with Common Substring
"""
import sys
from pathlib import Path
import os
def get_test_directory() -> Path:
"""Get the test directory from FILESYSTEM_TEST_DIR env var."""
test_root = os.environ.get("FILESYSTEM_TEST_DIR")
if not test_root:
raise ValueError("FILESYSTEM_TEST_DIR environment variable is required")
return Path(test_root)
def verify_answer_file_exists(test_dir: Path) -> bool:
"""Verify that the answer.txt file exists."""
answer_file = test_dir / "answer.txt"
if not answer_file.exists():
print("❌ File 'answer.txt' not found")
return False
print("✅ Answer file found")
return True
def verify_answer_format(test_dir: Path) -> bool:
"""Verify that the answer file has the correct format."""
answer_file = test_dir / "answer.txt"
try:
content = answer_file.read_text().strip()
# If file is empty, that's acceptable (no matches found)
if not content:
print("✅ Answer file is empty (no matches found)")
return True
lines = content.split('\n')
for i, line in enumerate(lines, 1):
line = line.strip()
if not line:
continue
# Check format: just filename.txt
if not line.endswith('.txt') or not line.startswith('file_'):
print(f"❌ Line {i} has incorrect format: {line}")
print(" Expected format: filename.txt")
return False
print("✅ Answer format is correct")
return True
except Exception as e:
print(f"❌ Error reading answer file: {e}")
return False
def find_30_plus_char_matches(test_dir: Path) -> set:
"""Find all files that have 30+ character substring matches with large_file.txt."""
large_file = test_dir / "large_file.txt"
if not large_file.exists():
print("❌ large_file.txt not found")
return set()
large_content = large_file.read_text()
matching_files = set()
# Check each file from file_01.txt to file_20.txt
for i in range(1, 21):
filename = f"file_{i:02d}.txt"
file_path = test_dir / filename
if not file_path.exists():
continue
file_content = file_path.read_text()
# Check if there's a substring of 30+ characters that matches
has_match = False
for start_pos in range(len(file_content)):
for end_pos in range(start_pos + 30, len(file_content) + 1):
substring = file_content[start_pos:end_pos]
if substring in large_content:
has_match = True
break
if has_match:
break
if has_match:
matching_files.add(filename)
return matching_files
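# Note: the scan above is brute force: it tries every (start, end) window of length
# >= 30 and tests it with a substring lookup in large_file.txt. Checking only the
# 30-character window at each start position would be sufficient (any match of 30 or
# more characters necessarily contains a 30-character match), but the exhaustive
# version should still be fast enough for these small fixture files.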
def verify_matches_are_correct(test_dir: Path) -> bool:
"""Verify that the files listed in answer.txt actually have 30+ character matches."""
answer_file = test_dir / "answer.txt"
try:
content = answer_file.read_text().strip()
# If no content, check if there should actually be no matches
if not content:
expected_matches = find_30_plus_char_matches(test_dir)
if expected_matches:
print("❌ Answer file is empty but matches should exist")
for filename in expected_matches:
print(f" Expected: {filename}")
return False
else:
print("✅ No matches found (correct)")
return True
# Parse answer file
answer_files = set()
lines = content.split('\n')
for line in lines:
line = line.strip()
if not line:
continue
answer_files.add(line)
# Get expected matches
expected_matches = find_30_plus_char_matches(test_dir)
# Check if all answer files actually have matches
for filename in answer_files:
if filename not in expected_matches:
print(f"❌ File {filename} listed in answer but has no valid 30+ character match")
return False
# Check if all expected matches are in answer
for filename in expected_matches:
if filename not in answer_files:
print(f"❌ Missing match for {filename} in answer file")
return False
print("✅ All matches are correct")
return True
except Exception as e:
print(f"❌ Error verifying matches: {e}")
return False
def verify_files_exist(test_dir: Path) -> bool:
"""Verify that all files mentioned in answer.txt actually exist."""
answer_file = test_dir / "answer.txt"
try:
content = answer_file.read_text().strip()
if not content:
return True # No files to verify
lines = content.split('\n')
for line in lines:
line = line.strip()
if not line:
continue
file_path = test_dir / line
if not file_path.exists():
print(f"❌ File mentioned in answer does not exist: {line}")
return False
print("✅ All files mentioned in answer exist")
return True
except Exception as e:
print(f"❌ Error verifying file existence: {e}")
return False
def main():
"""Main verification function."""
test_dir = get_test_directory()
print("🔍 Verifying File Filtering Task: Find Files with Common Substring...")
# Define verification steps
verification_steps = [
("Answer File Exists", verify_answer_file_exists),
("Answer Format", verify_answer_format),
("Files Exist", verify_files_exist),
("Matches are Correct", verify_matches_are_correct),
]
# Run all verification steps
all_passed = True
for step_name, verify_func in verification_steps:
print(f"\n--- {step_name} ---")
if not verify_func(test_dir):
all_passed = False
# Final result
print("\n" + "="*50)
if all_passed:
print("✅ File filtering task completed correctly!")
print("🎉 Task verification: PASS")
sys.exit(0)
else:
print("❌ Task verification: FAIL")
sys.exit(1)
if __name__ == "__main__":
main()
================================================
FILE: tasks/filesystem/easy/file_context/uppercase/description.md
================================================
# File Context Task: Convert Files to Uppercase
## 📋 Task Description
You need to process 5 text files (file_01.txt to file_05.txt) and convert their content to uppercase.
## 🎯 Task Objectives
1. **Create an uppercase directory** in the test environment root
2. **Convert each file** from file_01.txt to file_05.txt to uppercase
3. **Save converted files** in the uppercase/ directory with the same names
================================================
FILE: tasks/filesystem/easy/file_context/uppercase/meta.json
================================================
{
"task_id": "uppercase",
"task_name": "Uppercase",
"category_id": "file_context",
"category_name": "File Context",
"description": "Copy file_01.txt-file_05.txt into an uppercase/ folder and convert the contents of every file to uppercase text.",
"author": "Lingjun Chen",
"created_at": "2025-11-15",
"difficulty": "L1",
"tags": [
"content transformation",
"batch processing"
],
"mcp": [
"filesystem"
],
"meta_data": {
"stateType": "text",
"stateContent": "file_context/\n \u251c\u2500\u2500 file_01.txt\n \u251c\u2500\u2500 file_02.txt\n \u251c\u2500\u2500 file_03.txt\n \u251c\u2500\u2500 file_04.txt\n \u251c\u2500\u2500 file_05.txt\n \u251c\u2500\u2500 file_06.txt\n \u251c\u2500\u2500 file_07.txt\n \u251c\u2500\u2500 file_08.txt\n \u251c\u2500\u2500 file_09.txt\n \u251c\u2500\u2500 file_10.txt\n \u251c\u2500\u2500 file_11.txt\n \u251c\u2500\u2500 file_12.txt\n \u251c\u2500\u2500 file_13.txt\n \u251c\u2500\u2500 file_14.txt\n \u251c\u2500\u2500 file_15.txt\n \u251c\u2500\u2500 file_16.txt\n \u251c\u2500\u2500 file_17.txt\n \u251c\u2500\u2500 file_18.txt\n \u251c\u2500\u2500 file_19.txt\n \u251c\u2500\u2500 file_20.txt\n \u2514\u2500\u2500 large_file.txt",
"stateUrl": "https://storage.mcpmark.ai/filesystem/file_context.zip",
"stateOriginalUrl": null
}
}
================================================
FILE: tasks/filesystem/easy/file_context/uppercase/verify.py
================================================
#!/usr/bin/env python3
"""
Verification script for File Context Task: Convert Files to Uppercase
"""
import sys
from pathlib import Path
import os
import re
def get_test_directory() -> Path:
"""Get the test directory from FILESYSTEM_TEST_DIR env var."""
test_root = os.environ.get("FILESYSTEM_TEST_DIR")
if not test_root:
raise ValueError("FILESYSTEM_TEST_DIR environment variable is required")
return Path(test_root)
def verify_uppercase_directory_exists(test_dir: Path) -> bool:
"""Verify that the uppercase directory exists."""
uppercase_dir = test_dir / "uppercase"
if not uppercase_dir.exists():
print("❌ Directory 'uppercase' not found")
return False
if not uppercase_dir.is_dir():
print("❌ 'uppercase' exists but is not a directory")
return False
print("✅ Uppercase directory found")
return True
def verify_uppercase_files_exist(test_dir: Path) -> bool:
"""Verify that all 5 uppercase files exist."""
uppercase_dir = test_dir / "uppercase"
for i in range(1, 6):
filename = f"file_{i:02d}.txt"
file_path = uppercase_dir / filename
if not file_path.exists():
print(f"❌ File '{filename}' not found in uppercase directory")
return False
print("✅ All 5 uppercase files found")
return True
def verify_uppercase_content(test_dir: Path) -> bool:
"""Verify that uppercase files contain the correct uppercase content."""
uppercase_dir = test_dir / "uppercase"
for i in range(1, 6):
filename = f"file_{i:02d}.txt"
original_file = test_dir / filename
uppercase_file = uppercase_dir / filename
if not original_file.exists():
print(f"❌ Original file '{filename}' not found")
return False
try:
original_content = original_file.read_text()
uppercase_content = uppercase_file.read_text()
# Check if uppercase content is the uppercase version of original
expected_uppercase = original_content.upper()
if uppercase_content != expected_uppercase:
print(f"❌ File '{filename}' content is not properly converted to uppercase")
return False
except Exception as e:
print(f"❌ Error reading file '{filename}': {e}")
return False
print("✅ All uppercase files contain correct uppercase content")
return True
def verify_answer_file_exists(test_dir: Path) -> bool:
"""Verify that the answer.txt file exists in the uppercase directory."""
uppercase_dir = test_dir / "uppercase"
answer_file = uppercase_dir / "answer.txt"
if not answer_file.exists():
print("❌ File 'answer.txt' not found in uppercase directory")
return False
print("✅ Answer file found in uppercase directory")
return True
def verify_answer_format(test_dir: Path) -> bool:
"""Verify that the answer file has the correct format."""
uppercase_dir = test_dir / "uppercase"
answer_file = uppercase_dir / "answer.txt"
try:
content = answer_file.read_text().strip()
if not content:
print("❌ Answer file is empty")
return False
lines = content.split('\n')
# Check if we have exactly 10 lines
if len(lines) != 10:
print(f"❌ Answer file has {len(lines)} lines, expected 10")
return False
for i, line in enumerate(lines, 1):
line = line.strip()
if not line:
print(f"❌ Line {i} is empty")
return False
# Check format: filename:word_count
if ':' not in line:
print(f"❌ Line {i} has incorrect format: {line}")
print(" Expected format: filename:word_count")
return False
parts = line.split(':', 1)
if len(parts) != 2:
print(f"❌ Line {i} has incorrect format: {line}")
print(" Expected format: filename:word_count")
return False
filename, word_count_str = parts
# Check filename format
if not filename.endswith('.txt') or not filename.startswith('file_'):
print(f"❌ Line {i} has invalid filename: {filename}")
return False
# Check word count format (should be integer)
try:
word_count = int(word_count_str)
if word_count <= 0:
print(f"❌ Line {i} has invalid word count: {word_count_str}")
return False
except ValueError:
print(f"❌ Line {i} has non-integer word count: {word_count_str}")
return False
print("✅ Answer format is correct")
return True
except Exception as e:
print(f"❌ Error reading answer file: {e}")
return False
def count_words_in_file(file_path: Path) -> int:
"""Count words in a file."""
try:
content = file_path.read_text()
# Split by whitespace and filter out empty strings
words = [word for word in content.split() if word.strip()]
return len(words)
except Exception as e:
print(f"❌ Error reading file {file_path}: {e}")
return 0
def verify_word_counts_are_correct(test_dir: Path) -> bool:
"""Verify that the word counts in answer.txt are correct."""
uppercase_dir = test_dir / "uppercase"
answer_file = uppercase_dir / "answer.txt"
try:
content = answer_file.read_text().strip()
lines = content.split('\n')
# Expected word counts based on answer.md
expected_counts = [22, 22, 22, 22, 18, 22, 22, 22, 18, 20]
# Create a set of expected file entries for easier checking
expected_entries = set()
for i in range(1, 11):
filename = f"file_{i:02d}.txt"
expected_count = expected_counts[i - 1]
if i == 6: # Special case for file_06.txt: can be 21 or 22
expected_entries.add(f"{filename}:21")
expected_entries.add(f"{filename}:22")
else:
expected_entries.add(f"{filename}:{expected_count}")
# Check each line in the answer file
found_entries = set()
for line in lines:
line = line.strip()
if line in expected_entries:
found_entries.add(line)
else:
print(f"❌ Invalid entry: {line}")
return False
# Check if we found all expected entries
if len(found_entries) != 10:
print(f"❌ Found {len(found_entries)} entries, expected 10")
missing = expected_entries - found_entries
if missing:
print(f" Missing entries: {missing}")
return False
print("✅ All word counts are correct")
return True
except Exception as e:
print(f"❌ Error verifying word counts: {e}")
return False
def verify_all_files_are_included(test_dir: Path) -> bool:
"""Verify that all 10 files are included in the answer."""
uppercase_dir = test_dir / "uppercase"
answer_file = uppercase_dir / "answer.txt"
try:
content = answer_file.read_text().strip()
lines = content.split('\n')
# Check that all 10 files are present
found_files = set()
for line in lines:
parts = line.split(':', 1)
filename = parts[0]
found_files.add(filename)
expected_files = {f"file_{i:02d}.txt" for i in range(1, 11)}
if found_files != expected_files:
missing = expected_files - found_files
extra = found_files - expected_files
if missing:
print(f"❌ Missing files in answer: {missing}")
if extra:
print(f"❌ Extra files in answer: {extra}")
return False
print("✅ All 10 files are included in answer")
return True
except Exception as e:
print(f"❌ Error verifying file inclusion: {e}")
return False
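# Note: only the three uppercase checks below are wired into main(); the answer.txt
# helpers above (verify_answer_file_exists through verify_all_files_are_included)
# are defined but not invoked by this verifier.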
def main():
"""Main verification function."""
try:
test_dir = get_test_directory()
print(f"🔍 Verifying Uppercase in: {test_dir}")
print()
# Run all verification checks
checks = [
("Uppercase directory exists", verify_uppercase_directory_exists),
("Uppercase files exist", verify_uppercase_files_exist),
("Uppercase content is correct", verify_uppercase_content),
]
all_passed = True
for check_name, check_func in checks:
print(f"📋 {check_name}...")
if not check_func(test_dir):
all_passed = False
print()
if all_passed:
print("🎉 All verification checks passed!")
sys.exit(0)
else:
print("❌ Some verification checks failed!")
sys.exit(1)
except Exception as e:
print(f"❌ Verification failed with error: {e}")
sys.exit(1)
if __name__ == "__main__":
main()
================================================
FILE: tasks/filesystem/easy/file_property/largest_rename/description.md
================================================
# Largest File Rename Task
## 📋 Task Description
Rename the largest `.jpg` file in the test directory to `largest.jpg` based on file size.
## 🎯 Task Objectives
1. **Find all `.jpg` files** in the test directory
2. **Determine which `.jpg` file is the largest** by file size
3. **Rename the largest `.jpg` file to `largest.jpg`**
================================================
FILE: tasks/filesystem/easy/file_property/largest_rename/meta.json
================================================
{
"task_id": "largest_rename",
"task_name": "Largest File Rename",
"category_id": "file_property",
"category_name": "File Property",
"description": "Identify the largest .jpg in the workspace and rename it to largest.jpg while leaving the other files untouched.",
"author": "Lingjun Chen",
"created_at": "2025-11-15",
"difficulty": "L1",
"tags": [
"file organization",
"attribute inspection"
],
"mcp": [
"filesystem"
],
"meta_data": {
"stateType": "text",
"stateContent": "file_property/\n \u251c\u2500\u2500 bear.jpg\n \u251c\u2500\u2500 bridge.jpg\n \u251c\u2500\u2500 bus.MOV\n \u251c\u2500\u2500 random_file_1.txt\n \u251c\u2500\u2500 random_file_2.txt\n \u251c\u2500\u2500 random_file_3.txt\n \u251c\u2500\u2500 road.MOV\n \u2514\u2500\u2500 sg.jpg",
"stateUrl": "https://storage.mcpmark.ai/filesystem/file_property.zip",
"stateOriginalUrl": null
}
}
================================================
FILE: tasks/filesystem/easy/file_property/largest_rename/verify.py
================================================
#!/usr/bin/env python3
"""
Verification script for Largest File Rename Task
"""
import sys
from pathlib import Path
import os
def get_test_directory() -> Path:
"""Get the test directory from FILESYSTEM_TEST_DIR env var."""
test_root = os.environ.get("FILESYSTEM_TEST_DIR")
if not test_root:
raise ValueError("FILESYSTEM_TEST_DIR environment variable is required")
return Path(test_root)
def verify_sg_jpg_not_exists(test_dir: Path) -> bool:
"""Verify that sg.jpg does not exist."""
sg_file = test_dir / "sg.jpg"
if sg_file.exists():
print("❌ sg.jpg still exists (should be renamed)")
return False
print("✅ sg.jpg does not exist")
return True
def verify_largest_jpg_exists(test_dir: Path) -> bool:
"""Verify that largest.jpg exists."""
largest_file = test_dir / "largest.jpg"
if not largest_file.exists():
print("❌ largest.jpg does not exist")
return False
print("✅ largest.jpg exists")
return True
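# Note: in this fixture the largest .jpg is sg.jpg, so the verifier only needs to
# confirm that sg.jpg no longer exists and that largest.jpg does.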
def main():
"""Main verification function."""
try:
test_dir = get_test_directory()
print(f"🔍 Verifying largest file rename in: {test_dir}")
# Run all verification checks
checks = [
("sg.jpg does not exist", verify_sg_jpg_not_exists),
("largest.jpg exists", verify_largest_jpg_exists)
]
all_passed = True
for check_name, check_func in checks:
print(f"\n📋 Checking: {check_name}")
if not check_func(test_dir):
all_passed = False
if all_passed:
print("\n🎉 All verification checks passed!")
sys.exit(0)
else:
print("\n❌ Some verification checks failed!")
sys.exit(1)
except Exception as e:
print(f"❌ Verification failed with error: {e}")
sys.exit(1)
if __name__ == "__main__":
main()
================================================
FILE: tasks/filesystem/easy/file_property/txt_merging/description.md
================================================
# Text File Merging Task
## 📋 Task Description
Merge all `.txt` files in the test directory into a single file called `merge.txt`. The merged file should contain the content from all `.txt` files.
## 🎯 Task Objectives
1. **Read all `.txt` files** in the test directory
2. **Create a new file** called `merge.txt` in the test directory
3. **Write the content** from all `.txt` files into `merge.txt`
4. **The order** of content doesn't matter, as long as all content from every `.txt` file is present in `merge.txt`
================================================
FILE: tasks/filesystem/easy/file_property/txt_merging/meta.json
================================================
{
"task_id": "txt_merging",
"task_name": "Text File Merging",
"category_id": "file_property",
"category_name": "File Property",
"description": "Combine the contents of every .txt file into a single merge.txt file so the archive has one consolidated view.",
"author": "Lingjun Chen",
"created_at": "2025-11-15",
"difficulty": "L1",
"tags": [
"content consolidation",
"file automation"
],
"mcp": [
"filesystem"
],
"meta_data": {
"stateType": "text",
"stateContent": "file_property/\n \u251c\u2500\u2500 bear.jpg\n \u251c\u2500\u2500 bridge.jpg\n \u251c\u2500\u2500 bus.MOV\n \u251c\u2500\u2500 random_file_1.txt\n \u251c\u2500\u2500 random_file_2.txt\n \u251c\u2500\u2500 random_file_3.txt\n \u251c\u2500\u2500 road.MOV\n \u2514\u2500\u2500 sg.jpg",
"stateUrl": "https://storage.mcpmark.ai/filesystem/file_property.zip",
"stateOriginalUrl": null
}
}
================================================
FILE: tasks/filesystem/easy/file_property/txt_merging/verify.py
================================================
#!/usr/bin/env python3
"""
Verification script for Text File Merging Task
"""
import sys
from pathlib import Path
import os
def get_test_directory() -> Path:
"""Get the test directory from FILESYSTEM_TEST_DIR env var."""
test_root = os.environ.get("FILESYSTEM_TEST_DIR")
if not test_root:
raise ValueError("FILESYSTEM_TEST_DIR environment variable is required")
return Path(test_root)
def get_expected_contents():
"""Return the expected content from each .txt file."""
return [
"O rErmZ4tDgzMNoxn1oNfQhT1TRpy9w0tQPGTcrsaoMFrrgt9bY5mgBxO6q8c8lZywXxEEBWW4i6Jh9NbAtYtRKvkzB4bshGIMzn2G1 rDTpKJj",
"DmRrDFFaIl1mPubzSJJaN4aMeZyBHqVxZe5tpztHQ9zSe6b69Hnl7coqeNJXHXU2EnaDnyhYxZSWHPn3IWLsLGWrx7py8d37Z8blMnh7VDUH7hAMamhLRO8lfUVV1roM8a0njnW9evXRq5AoNTt8Tv7kQ5LmLe6Z66MZwtjckRAXmOB4x3AYbbxLULYZAxitW1KNG1yTaDOYZQhtKdZkX1XqytzBl9dRXI4gk91ZlVHLOiujwUa89EVsdjayKeCc21gCJMXvbhDSOGAs6dXZEHuaHQnnBdM19X3TwPgfDONyhlc pjwoQ45D56UQVWxwNIJUTgwS1vctYOx4XFpMgf3PRQ7zZdfhIuPBFdQwnQvYUeQbWa5gnyMO9FVSU0vm9uccbJQvkcEAJzMkEh9i7z6EEixtbwVedlTGWL2XBwjenRdf2qsOgvJo8Dyuvf35ieCFMG7wR7200rs GJZ5bRdx4R2gGOWVMi3MOBrqcw3KhbcpJtdQoKMALEjBMrY7VYKtAZNI6LoXX OOTJZ3x3usHRJY0gMtKhh6OJ 37aknvBwNYJ0IRWYWaeJ8LBwJyO6ZV3ZJ0palISQvGaHEZ0olHnK2iNCTxqxvF8J7EdIdIPYssl5f0XgPl6",
"aFCzXJbJq02zlCKnyarJnPUiwVIuUrQci3fZvGD53F5fUsKDUlEwO5 ANJ2VgBnJ5cuBJzjILcM9AxTvyNZ5NPIHjSCo5O20K"
]
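# The three strings above are the expected contents of the fixture's .txt files
# (random_file_1.txt through random_file_3.txt per the task's stateContent); the check
# below only requires each string to appear somewhere in merge.txt, in any order.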
def verify_merge_file_exists(test_dir: Path) -> bool:
"""Verify that merge.txt exists in the test directory."""
merge_file = test_dir / "merge.txt"
if not merge_file.exists():
print("❌ merge.txt not found")
return False
if not merge_file.is_file():
print("❌ merge.txt exists but is not a file")
return False
print("✅ merge.txt exists")
return True
def verify_merge_file_contents(test_dir: Path) -> bool:
"""Verify that merge.txt contains all expected content strings."""
merge_file = test_dir / "merge.txt"
expected_contents = get_expected_contents()
try:
with open(merge_file, 'r', encoding='utf-8') as f:
merge_content = f.read()
except Exception as e:
print(f"❌ Failed to read merge.txt: {e}")
return False
# Check that each expected content string is present in the merged file
missing_contents = []
for content in expected_contents:
if content not in merge_content:
missing_contents.append(content[:50] + "..." if len(content) > 50 else content)
if missing_contents:
print(f"❌ Missing content in merge.txt:")
for content in missing_contents:
print(f" - {content}")
return False
print("✅ merge.txt contains all expected content")
return True
def main():
"""Main verification function."""
try:
test_dir = get_test_directory()
print(f"🔍 Verifying text file merging in: {test_dir}")
# Run all verification checks
checks = [
("Merge file existence", verify_merge_file_exists),
("Merge file contents", verify_merge_file_contents)
]
all_passed = True
for check_name, check_func in checks:
print(f"\n📋 Checking: {check_name}")
if not check_func(test_dir):
all_passed = False
if all_passed:
print("\n🎉 All verification checks passed!")
sys.exit(0)
else:
print("\n❌ Some verification checks failed!")
sys.exit(1)
except Exception as e:
print(f"❌ Verification failed with error: {e}")
sys.exit(1)
if __name__ == "__main__":
main()
================================================
FILE: tasks/filesystem/easy/folder_structure/structure_analysis/description.md
================================================
# Directory Structure Analysis Task
You need to recursively traverse the folder structure under the main directory and count the total number of `.py` files (including those in all subdirectories).
Write the answer (just a single number) in a file named `structure_analysis.txt` in the main directory (at the same level as the `complex_structure` folder).
You must not change or delete any existing files.
Do not use Python code.
================================================
FILE: tasks/filesystem/easy/folder_structure/structure_analysis/meta.json
================================================
{
"task_id": "structure_analysis",
"task_name": "Structure Analysis",
"category_id": "folder_structure",
"category_name": "Folder Structure",
"description": "Recursively inspect the complex_structure tree, count all .py files, and save the total as the only line of structure_analysis.txt.",
"author": "Lingjun Chen",
"created_at": "2025-11-15",
"difficulty": "L1",
"tags": [
"data extraction",
"filesystem traversal"
],
"mcp": [
"filesystem"
],
"meta_data": {
"stateType": "text",
"stateContent": "folder_structure/\n \u2514\u2500\u2500 complex_structure/\n \u251c\u2500\u2500 deeply/\n \u2502 \u2514\u2500\u2500 nested/\n \u2502 \u2514\u2500\u2500 folder/\n \u2502 \u2514\u2500\u2500 structure/\n \u251c\u2500\u2500 empty_folder/\n \u251c\u2500\u2500 folder_lxkHt_0_1/\n \u2502 \u2514\u2500\u2500 file_PeLzC_0.txt\n \u251c\u2500\u2500 folder_QdTAj_0_2/\n \u2502 \u251c\u2500\u2500 folder_eXccj_1_0/\n \u2502 \u2502 \u251c\u2500\u2500 folder_Mqlwh_2_1/\n \u2502 \u2502 \u2502 \u251c\u2500\u2500 folder_cKxcP_3_3/\n \u2502 \u2502 \u2502 \u2502 \u251c\u2500\u2500 folder_BPTMK_4_1/\n \u2502 \u2502 \u2502 \u2502 \u2502 \u2514\u2500\u2500 file_RHtBP_0.txt\n \u2502 \u2502 \u2502 \u2502 \u251c\u2500\u2500 folder_QNqjq_4_0/\n \u2502 \u2502 \u2502 \u2502 \u2502 \u251c\u2500\u2500 folder_gRwPE_5_1/\n \u2502 \u2502 \u2502 \u2502 \u2502 \u2502 \u251c\u2500\u2500 file_jVlpp_0.txt\n \u2502 \u2502 \u2502 \u2502 \u2502 \u2502 \u2514\u2500\u2500 file_vJuHz_1.txt\n \u2502 \u2502 \u2502 \u2502 \u2502 \u251c\u2500\u2500 folder_XdXYJ_5_0/\n \u2502 \u2502 \u2502 \u2502 \u2502 \u2502 \u2514\u2500\u2500 file_KvkKi_0.txt\n \u2502 \u2502 \u2502 \u2502 \u2502 \u251c\u2500\u2500 file_gGxLG_2.txt\n \u2502 \u2502 \u2502 \u2502 \u2502 \u251c\u2500\u2500 file_Hzkxo_0.txt\n \u2502 \u2502 \u2502 \u2502 \u2502 \u2514\u2500\u2500 file_XRjeh_1.txt\n \u2502 \u2502 \u2502 \u2502 \u251c\u2500\u2500 folder_vIBIt_4_2/\n \u2502 \u2502 \u2502 \u2502 \u2502 \u251c\u2500\u2500 folder_kRDNS_5_0/\n \u2502 \u2502 \u2502 \u2502 \u2502 \u2502 \u2514\u2500\u2500 file_wFSjJ_0.txt\n \u2502 \u2502 \u2502 \u2502 \u2502 \u2514\u2500\u2500 file_NyBSO_0.txt\n \u2502 \u2502 \u2502 \u2502 \u251c\u2500\u2500 file_EOCNf_1.txt\n \u2502 \u2502 \u2502 \u2502 \u2514\u2500\u2500 file_gmrXA_0.txt\n \u2502 \u2502 \u2502 \u251c\u2500\u2500 folder_NcruA_3_1/\n \u2502 \u2502 \u2502 \u2502 \u251c\u2500\u2500 file_bLWDj_1.txt\n \u2502 \u2502 \u2502 \u2502 \u2514\u2500\u2500 file_WAftR_0.txt\n \u2502 \u2502 \u2502 \u251c\u2500\u2500 folder_qCDFI_3_2/\n \u2502 \u2502 \u2502 \u2502 \u251c\u2500\u2500 file_eSMOJ_0.txt\n \u2502 \u2502 \u2502 \u2502 \u251c\u2500\u2500 file_oxADy_2.txt\n \u2502 \u2502 \u2502 \u2502 \u2514\u2500\u2500 file_RTbbc_1.txt\n \u2502 \u2502 \u2502 \u251c\u2500\u2500 folder_QVHUU_3_0/\n \u2502 \u2502 \u2502 \u2502 \u251c\u2500\u2500 folder_FEPTK_4_1/\n \u2502 \u2502 \u2502 \u2502 \u2502 \u251c\u2500\u2500 folder_GHoMC_5_1/\n \u2502 \u2502 \u2502 \u2502 \u2502 \u2502 \u2514\u2500\u2500 file_rAMYd_0.txt\n \u2502 \u2502 \u2502 \u2502 \u2502 \u251c\u2500\u2500 folder_iBDUY_5_0/\n \u2502 \u2502 \u2502 \u2502 \u2502 \u2502 \u2514\u2500\u2500 file_IJCaw_0.txt\n \u2502 \u2502 \u2502 \u2502 \u2502 \u251c\u2500\u2500 folder_VRXgp_5_2/\n \u2502 \u2502 \u2502 \u2502 \u2502 \u2502 \u2514\u2500\u2500 file_hkUmS_0.txt\n \u2502 \u2502 \u2502 \u2502 \u2502 \u251c\u2500\u2500 file_nqLAf_1.txt\n \u2502 \u2502 \u2502 \u2502 \u2502 \u2514\u2500\u2500 file_XflmA_0.txt\n \u2502 \u2502 \u2502 \u2502 \u251c\u2500\u2500 folder_FlPoK_4_3/\n \u2502 \u2502 \u2502 \u2502 \u2502 \u251c\u2500\u2500 folder_hSVNm_5_3/\n \u2502 \u2502 \u2502 \u2502 \u2502 \u2502 \u2514\u2500\u2500 file_klnbn_0.txt\n \u2502 \u2502 \u2502 \u2502 \u2502 \u251c\u2500\u2500 folder_iZuEl_5_0/\n \u2502 \u2502 \u2502 \u2502 \u2502 \u2502 \u2514\u2500\u2500 file_LqAmy_0.txt\n \u2502 \u2502 \u2502 \u2502 \u2502 \u251c\u2500\u2500 folder_LcURj_5_2/\n \u2502 \u2502 \u2502 \u2502 \u2502 \u2502 \u251c\u2500\u2500 file_RgwOS_1.txt\n \u2502 \u2502 \u2502 \u2502 \u2502 \u2502 \u2514\u2500\u2500 
file_ZHnYb_0.txt\n \u2502 \u2502 \u2502 \u2502 \u2502 \u251c\u2500\u2500 folder_tuZQJ_5_1/\n \u2502 \u2502 \u2502 \u2502 \u2502 \u2502 \u2514\u2500\u2500 file_LHuIx_0.txt\n \u2502 \u2502 \u2502 \u2502 \u2502 \u251c\u2500\u2500 file_asJnB_1.txt\n \u2502 \u2502 \u2502 \u2502 \u2502 \u2514\u2500\u2500 file_EzLdu_0.txt\n \u2502 \u2502 \u2502 \u2502 \u251c\u2500\u2500 folder_ndhsJ_4_0/\n \u2502 \u2502 \u2502 \u2502 \u2502 \u251c\u2500\u2500 folder_CUSXK_5_0/\n \u2502 \u2502 \u2502 \u2502 \u2502 \u2502 \u251c\u2500\u2500 file_DpiuM_1.txt\n \u2502 \u2502 \u2502 \u2502 \u2502 \u2502 \u2514\u2500\u2500 file_pSqeG_0.txt\n \u2502 \u2502 \u2502 \u2502 \u2502 \u251c\u2500\u2500 folder_pstmE_5_1/\n \u2502 \u2502 \u2502 \u2502 \u2502 \u2502 \u2514\u2500\u2500 file_YwdJt_0.txt\n \u2502 \u2502 \u2502 \u2502 \u2502 \u251c\u2500\u2500 folder_StlsP_5_2/\n \u2502 \u2502 \u2502 \u2502 \u2502 \u2502 \u251c\u2500\u2500 file_kriBJ_0.txt\n \u2502 \u2502 \u2502 \u2502 \u2502 \u2502 \u2514\u2500\u2500 file_XCEdm_1.txt\n \u2502 \u2502 \u2502 \u2502 \u2502 \u251c\u2500\u2500 file_ToDjh_1.txt\n \u2502 \u2502 \u2502 \u2502 \u2502 \u2514\u2500\u2500 file_xbIVx_0.txt\n \u2502 \u2502 \u2502 \u2502 \u251c\u2500\u2500 folder_PJBok_4_4/\n \u2502 \u2502 \u2502 \u2502 \u2502 \u251c\u2500\u2500 folder_mzxaf_5_0/\n \u2502 \u2502 \u2502 \u2502 \u2502 \u2502 \u251c\u2500\u2500 file_ILBzj_2.txt\n \u2502 \u2502 \u2502 \u2502 \u2502 \u2502 \u251c\u2500\u2500 file_MTGMm_1.txt\n \u2502 \u2502 \u2502 \u2502 \u2502 \u2502 \u2514\u2500\u2500 file_zBDqz_0.txt\n \u2502 \u2502 \u2502 \u2502 \u2502 \u251c\u2500\u2500 folder_sULMj_5_1/\n \u2502 \u2502 \u2502 \u2502 \u2502 \u2502 \u251c\u2500\u2500 file_BHziw_1.txt\n \u2502 \u2502 \u2502 \u2502 \u2502 \u2502 \u251c\u2500\u2500 file_sIjiu_2.txt\n \u2502 \u2502 \u2502 \u2502 \u2502 \u2502 \u2514\u2500\u2500 file_VqNkB_0.txt\n \u2502 \u2502 \u2502 \u2502 \u2502 \u251c\u2500\u2500 folder_vypSi_5_3/\n \u2502 \u2502 \u2502 \u2502 \u2502 \u2502 \u251c\u2500\u2500 file_kZbIm_1.txt\n \u2502 \u2502 \u2502 \u2502 \u2502 \u2502 \u2514\u2500\u2500 file_sOBtE_0.txt\n \u2502 \u2502 \u2502 \u2502 \u2502 \u251c\u2500\u2500 folder_ZLGHy_5_2/\n \u2502 \u2502 \u2502 \u2502 \u2502 \u2502 \u251c\u2500\u2500 file_azaFF_0.txt\n \u2502 \u2502 \u2502 \u2502 \u2502 \u2502 \u2514\u2500\u2500 file_nAFRe_1.txt\n \u2502 \u2502 \u2502 \u2502 \u2502 \u251c\u2500\u2500 file_mIkQU_0.txt\n \u2502 \u2502 \u2502 \u2502 \u2502 \u2514\u2500\u2500 file_sGPxd_1.txt\n \u2502 \u2502 \u2502 \u2502 \u251c\u2500\u2500 folder_VTbEG_4_2/\n \u2502 \u2502 \u2502 \u2502 \u2502 \u251c\u2500\u2500 file_HtYLg_0.txt\n \u2502 \u2502 \u2502 \u2502 \u2502 \u251c\u2500\u2500 file_JXjMd_1.txt\n \u2502 \u2502 \u2502 \u2502 \u2502 \u2514\u2500\u2500 file_tPccB_2.txt\n \u2502 \u2502 \u2502 \u2502 \u251c\u2500\u2500 file_BuOSw_1.txt\n \u2502 \u2502 \u2502 \u2502 \u2514\u2500\u2500 file_TpoqE_0.txt\n \u2502 \u2502 \u2502 \u251c\u2500\u2500 folder_wTvun_3_4/\n \u2502 \u2502 \u2502 \u2502 \u251c\u2500\u2500 file_GyhyE_1.txt\n \u2502 \u2502 \u2502 \u2502 \u251c\u2500\u2500 file_POsla_2.txt\n \u2502 \u2502 \u2502 \u2502 \u2514\u2500\u2500 file_tSsvk_0.txt\n \u2502 \u2502 \u2502 \u251c\u2500\u2500 file_irNju_0.txt\n \u2502 \u2502 \u2502 \u2514\u2500\u2500 file_jYBRm_1.txt\n \u2502 \u2502 \u251c\u2500\u2500 folder_YlJLI_2_0/\n \u2502 \u2502 \u2502 \u2514\u2500\u2500 file_FpFSL_0.txt\n \u2502 \u2502 \u251c\u2500\u2500 file_cFgBr_2.txt\n \u2502 \u2502 \u251c\u2500\u2500 file_lKEWN_1.txt\n \u2502 \u2502 \u2514\u2500\u2500 file_ZEWFP_0.txt\n \u2502 \u2514\u2500\u2500 
file_ayUCH_0.txt\n \u251c\u2500\u2500 folder_xtgyi_0_0/\n \u2502 \u2514\u2500\u2500 file_BvSOB_0.txt\n \u251c\u2500\u2500 mixed_content/\n \u2502 \u2514\u2500\u2500 images_and_text/\n \u2502 \u2514\u2500\u2500 notes.txt\n \u251c\u2500\u2500 project/\n \u2502 \u251c\u2500\u2500 docs/\n \u2502 \u2502 \u2514\u2500\u2500 archive/\n \u2502 \u2502 \u2514\u2500\u2500 2023/\n \u2502 \u2502 \u2514\u2500\u2500 reports/\n \u2502 \u2502 \u251c\u2500\u2500 report_0.txt\n \u2502 \u2502 \u251c\u2500\u2500 report_1.txt\n \u2502 \u2502 \u2514\u2500\u2500 report_2.txt\n \u2502 \u2514\u2500\u2500 src/\n \u2502 \u2514\u2500\u2500 main/\n \u2502 \u2514\u2500\u2500 resources/\n \u2514\u2500\u2500 m.py",
"stateUrl": "https://storage.mcpmark.ai/filesystem/folder_structure.zip",
"stateOriginalUrl": null
}
}
================================================
FILE: tasks/filesystem/easy/folder_structure/structure_analysis/verify.py
================================================
#!/usr/bin/env python3
"""
Verification script for Directory Structure Analysis Task
"""
import sys
from pathlib import Path
import os
def get_test_directory() -> Path:
"""Get the test directory from FILESYSTEM_TEST_DIR env var."""
test_root = os.environ.get("FILESYSTEM_TEST_DIR")
if not test_root:
raise ValueError("FILESYSTEM_TEST_DIR environment variable is required")
return Path(test_root)
def verify_structure_analysis_file_exists(test_dir: Path) -> bool:
"""Verify that the structure_analysis.txt file exists."""
analysis_file = test_dir / "structure_analysis.txt"
if not analysis_file.exists():
print("❌ File 'structure_analysis.txt' not found")
return False
print("✅ structure_analysis.txt file found")
return True
def verify_structure_analysis_content(test_dir: Path) -> bool:
"""Verify that the structure_analysis.txt file contains the correct count."""
analysis_file = test_dir / "structure_analysis.txt"
try:
content = analysis_file.read_text().strip()
if not content:
print("❌ structure_analysis.txt file is empty")
return False
# The expected answer is 1
expected_count = 1
# Check if content is exactly "1"
if content != str(expected_count):
print(f"❌ Expected '{expected_count}', but found: '{content}'")
return False
print(f"✅ Python file count is correct: {content}")
return True
except Exception as e:
print(f"❌ Error reading structure_analysis.txt file: {e}")
return False
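# Rationale: the prepared fixture (see "stateContent" in meta.json) appears to contain
# exactly one Python file, m.py, which is why the expected content of
# structure_analysis.txt is "1".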
def main():
"""Main verification function."""
try:
test_dir = get_test_directory()
print(f"🔍 Verifying Directory Structure Analysis Task in: {test_dir}")
print()
# Define verification steps
verification_steps = [
("Structure Analysis File Exists", verify_structure_analysis_file_exists),
("Python File Count is Correct", verify_structure_analysis_content),
]
# Run all verification steps
all_passed = True
for step_name, verify_func in verification_steps:
print(f"📋 {step_name}...")
if not verify_func(test_dir):
all_passed = False
print()
# Final result
if all_passed:
print("🎉 All verification checks passed!")
sys.exit(0)
else:
print("❌ Some verification checks failed!")
sys.exit(1)
except Exception as e:
print(f"❌ Verification failed with error: {e}")
sys.exit(1)
if __name__ == "__main__":
main()
================================================
FILE: tasks/filesystem/easy/legal_document/file_reorganize/description.md
================================================
# Legal Document File Reorganization Task
**Overview**
The folder "legal_files/" contains multiple versions of the Stock Purchase Agreement (Preferred_Stock_Purchase_Agreement_v0.txt through Preferred_Stock_Purchase_Agreement_v10.txt).
## Task
Your task is to:
1. Identify the final version of the document among the different versions
2. Create a folder named `final_version` inside the `legal_files/` directory
3. Create an **empty file** with the same name as the final version in the newly created `final_version/` folder
4. Keep the original file in its original location
Note: Due to the large file size, you only need to create an empty file (not copy the content). The filename should remain unchanged in the `final_version/` folder.
================================================
FILE: tasks/filesystem/easy/legal_document/file_reorganize/meta.json
================================================
{
"task_id": "file_reorganize",
"task_name": "File Reorganize",
"category_id": "legal_document",
"category_name": "Legal Document",
"description": "Determine the final Stock Purchase Agreement version and create an empty copy of that filename inside legal_files/final_version/.",
"author": "Lingjun Chen",
"created_at": "2025-11-15",
"difficulty": "L1",
"tags": [
"file organization",
"version management"
],
"mcp": [
"filesystem"
],
"meta_data": {
"stateType": "text",
"stateContent": "legal_document/\n \u2514\u2500\u2500 legal_files/\n \u251c\u2500\u2500 Preferred_Stock_Purchase_Agreement_v0.txt\n \u251c\u2500\u2500 Preferred_Stock_Purchase_Agreement_v1.txt\n \u251c\u2500\u2500 Preferred_Stock_Purchase_Agreement_v2.txt\n \u251c\u2500\u2500 Preferred_Stock_Purchase_Agreement_v3.txt\n \u251c\u2500\u2500 Preferred_Stock_Purchase_Agreement_v4.txt\n \u251c\u2500\u2500 Preferred_Stock_Purchase_Agreement_v5.txt\n \u251c\u2500\u2500 Preferred_Stock_Purchase_Agreement_v6.txt\n \u251c\u2500\u2500 Preferred_Stock_Purchase_Agreement_v7.txt\n \u251c\u2500\u2500 Preferred_Stock_Purchase_Agreement_v8.txt\n \u251c\u2500\u2500 Preferred_Stock_Purchase_Agreement_v9.txt\n \u2514\u2500\u2500 Preferred_Stock_Purchase_Agreement_v10.txt",
"stateUrl": "https://storage.mcpmark.ai/filesystem/legal_document.zip",
"stateOriginalUrl": "https://www.cooleygo.com/documents/nvca-financing-documents"
}
}
================================================
FILE: tasks/filesystem/easy/legal_document/file_reorganize/verify.py
================================================
#!/usr/bin/env python3
"""
Verification script for Legal Document File Reorganization Task
"""
import sys
from pathlib import Path
import os
def get_test_directory() -> Path:
"""Get the test directory from FILESYSTEM_TEST_DIR env var."""
test_root = os.environ.get("FILESYSTEM_TEST_DIR")
if not test_root:
raise ValueError("FILESYSTEM_TEST_DIR environment variable is required")
return Path(test_root)
def verify_final_version_folder_exists(test_dir: Path) -> bool:
"""Verify that the final_version folder exists in legal_files."""
final_version_dir = test_dir / "legal_files" / "final_version"
if not final_version_dir.exists():
print("❌ Folder 'legal_files/final_version' not found")
return False
if not final_version_dir.is_dir():
print("❌ 'legal_files/final_version' exists but is not a directory")
return False
print("✅ Folder 'legal_files/final_version' found")
return True
def verify_target_file_exists(test_dir: Path) -> bool:
"""Verify that Preferred_Stock_Purchase_Agreement_v10.txt exists in final_version folder."""
target_file = test_dir / "legal_files" / "final_version" / "Preferred_Stock_Purchase_Agreement_v10.txt"
if not target_file.exists():
print("❌ File 'legal_files/final_version/Preferred_Stock_Purchase_Agreement_v10.txt' not found")
return False
if not target_file.is_file():
print("❌ 'Preferred_Stock_Purchase_Agreement_v10.txt' exists but is not a file")
return False
print("✅ Target file 'Preferred_Stock_Purchase_Agreement_v10.txt' found in final_version folder")
return True
def verify_original_file_preserved(test_dir: Path) -> bool:
"""Verify that the original v10 file is still in place."""
original_file = test_dir / "legal_files" / "Preferred_Stock_Purchase_Agreement_v10.txt"
if not original_file.exists():
print("❌ Original file 'Preferred_Stock_Purchase_Agreement_v10.txt' was removed")
return False
print("✅ Original file 'Preferred_Stock_Purchase_Agreement_v10.txt' preserved")
return True
def verify_only_v10_in_final_version(test_dir: Path) -> bool:
"""Verify that final_version folder contains only v10 file."""
final_version_dir = test_dir / "legal_files" / "final_version"
# Get all files in final_version folder
files = list(final_version_dir.iterdir())
# Filter out directories, keep only files
files_only = [f for f in files if f.is_file()]
if len(files_only) != 1:
print(f"❌ final_version folder should contain exactly 1 file, but found {len(files_only)}")
for f in files_only:
print(f" - {f.name}")
return False
# Check if the only file is v10
if files_only[0].name != "Preferred_Stock_Purchase_Agreement_v10.txt":
print(f"❌ final_version folder contains wrong file: {files_only[0].name}")
print(" Expected: Preferred_Stock_Purchase_Agreement_v10.txt")
return False
print("✅ final_version folder contains only Preferred_Stock_Purchase_Agreement_v10.txt")
return True
def main():
"""Main verification function."""
test_dir = get_test_directory()
print("🔍 Verifying Legal Document File Reorganization Task...")
# Define verification steps
verification_steps = [
("Final Version Folder Exists", verify_final_version_folder_exists),
("Target File Exists", verify_target_file_exists),
("Only V10 in Final Version", verify_only_v10_in_final_version),
("Original File Preserved", verify_original_file_preserved),
]
# Run all verification steps
all_passed = True
for step_name, verify_func in verification_steps:
print(f"\n--- {step_name} ---")
if not verify_func(test_dir):
all_passed = False
# Final result
print("\n" + "="*50)
if all_passed:
print("✅ Legal document file reorganization completed correctly!")
print("🎉 Task verification: PASS")
sys.exit(0)
else:
print("❌ Task verification: FAIL")
sys.exit(1)
if __name__ == "__main__":
main()
================================================
FILE: tasks/filesystem/easy/papers/papers_counting/description.md
================================================
# File Context Task: Count HTML Files
## 📋 Task Description
You need to count the number of HTML files in the given directory and write the count to a file.
## 🎯 Task Objectives
1. **Count HTML files** in the given directory
2. **Create a file** named `count.txt` in the same directory
3. **Write the count** (just the number) to `count.txt`
## 📝 Expected Output
- File `count.txt` containing only the number of HTML files found
================================================
FILE: tasks/filesystem/easy/papers/papers_counting/meta.json
================================================
{
"task_id": "papers_counting",
"task_name": "Papers Counting",
"category_id": "papers",
"category_name": "Papers",
"description": "Count how many .html papers live in the directory and write just that number into count.txt.",
"author": "Lingjun Chen",
"created_at": "2025-11-15",
"difficulty": "L1",
"tags": [
"data extraction",
"reporting"
],
"mcp": [
"filesystem"
],
"meta_data": {
"stateType": "text",
"stateContent": "papers/\n \u251c\u2500\u2500 1707.06347.html\n \u251c\u2500\u2500 2105.04165.html\n \u251c\u2500\u2500 2201.11903.html\n \u251c\u2500\u2500 2303.08774.html\n \u251c\u2500\u2500 2306.08640.html\n \u251c\u2500\u2500 2310.02255.html\n \u251c\u2500\u2500 2310.08446.html\n \u251c\u2500\u2500 2312.00849.html\n \u251c\u2500\u2500 2312.07533.html\n \u251c\u2500\u2500 2312.11805.html\n \u251c\u2500\u2500 2402.00253.html\n \u251c\u2500\u2500 2402.03300.html\n \u251c\u2500\u2500 2403.05530.html\n \u251c\u2500\u2500 2404.13046.html\n \u251c\u2500\u2500 2404.14367.html\n \u251c\u2500\u2500 2404.14396.html\n \u251c\u2500\u2500 2405.09818.html\n \u251c\u2500\u2500 2405.13911.html\n \u251c\u2500\u2500 2405.16473.html\n \u251c\u2500\u2500 2405.16640.html\n \u251c\u2500\u2500 2406.08478.html\n \u251c\u2500\u2500 2406.16852.html\n \u251c\u2500\u2500 2406.17294.html\n \u251c\u2500\u2500 2407.01284.html\n \u251c\u2500\u2500 2407.01509.html\n \u251c\u2500\u2500 2407.21783.html\n \u251c\u2500\u2500 2408.03326.html\n \u251c\u2500\u2500 2408.12528.html\n \u251c\u2500\u2500 2409.19256.html\n \u251c\u2500\u2500 2410.05993.html\n \u251c\u2500\u2500 2410.06166.html\n \u251c\u2500\u2500 2410.10563.html\n \u251c\u2500\u2500 2410.13848.html\n \u251c\u2500\u2500 2410.17885.html\n \u251c\u2500\u2500 2410.21276.html\n \u251c\u2500\u2500 2411.07975.html\n \u251c\u2500\u2500 2411.10442.html\n \u251c\u2500\u2500 2411.11930.html\n \u251c\u2500\u2500 2411.14432.html\n \u251c\u2500\u2500 2412.05271.html\n \u251c\u2500\u2500 2412.08443.html\n \u251c\u2500\u2500 2412.10302.html\n \u251c\u2500\u2500 2412.15115.html\n \u251c\u2500\u2500 2412.16720.html\n \u251c\u2500\u2500 2412.17256.html\n \u251c\u2500\u2500 2412.18319.html\n \u251c\u2500\u2500 2412.20631.html\n \u251c\u2500\u2500 2501.04686.html\n \u251c\u2500\u2500 2501.06186.html\n \u251c\u2500\u2500 2501.12599.html\n \u251c\u2500\u2500 2501.12948.html\n \u251c\u2500\u2500 2501.17811.html\n \u251c\u2500\u2500 2502.01456.html\n \u251c\u2500\u2500 2502.09621.html\n \u251c\u2500\u2500 2502.10391.html\n \u251c\u2500\u2500 2502.13923.html\n \u251c\u2500\u2500 2503.01785.html\n \u251c\u2500\u2500 2503.06520.html\n \u251c\u2500\u2500 2503.06749.html\n \u251c\u2500\u2500 2503.07065.html\n \u251c\u2500\u2500 2503.07365.html\n \u251c\u2500\u2500 2503.07536.html\n \u251c\u2500\u2500 2503.10291.html\n \u251c\u2500\u2500 2503.10615.html\n \u251c\u2500\u2500 2503.12937.html\n \u251c\u2500\u2500 2503.13939.html\n \u251c\u2500\u2500 2503.14476.html\n \u251c\u2500\u2500 2503.17352.html\n \u251c\u2500\u2500 2503.18892.html\n \u251c\u2500\u2500 2503.19786.html\n \u251c\u2500\u2500 2503.20783.html\n \u251c\u2500\u2500 2503.21620.html\n \u251c\u2500\u2500 2503.21776.html\n \u251c\u2500\u2500 2503.22679.html\n \u251c\u2500\u2500 2504.02587.html\n \u251c\u2500\u2500 2504.05599.html\n \u251c\u2500\u2500 2504.07491.html\n \u251c\u2500\u2500 2504.07934.html\n \u251c\u2500\u2500 2504.07954.html\n \u251c\u2500\u2500 2504.11455.html\n \u251c\u2500\u2500 2504.14945.html\n \u251c\u2500\u2500 2504.16656.html\n \u251c\u2500\u2500 2505.00703.html\n \u2514\u2500\u2500 arxiv_2025.bib",
"stateUrl": "https://storage.mcpmark.ai/filesystem/papers.zip",
"stateOriginalUrl": null
}
}
================================================
FILE: tasks/filesystem/easy/papers/papers_counting/verify.py
================================================
#!/usr/bin/env python3
"""
Verification script for Paper Counting Task: Count HTML Files
"""
import sys
from pathlib import Path
import os
def get_test_directory() -> Path:
"""Get the test directory from FILESYSTEM_TEST_DIR env var."""
test_root = os.environ.get("FILESYSTEM_TEST_DIR")
if not test_root:
raise ValueError("FILESYSTEM_TEST_DIR environment variable is required")
return Path(test_root)
def verify_count_file_exists(test_dir: Path) -> bool:
"""Verify that the count.txt file exists."""
count_file = test_dir / "count.txt"
if not count_file.exists():
print("❌ File 'count.txt' not found")
return False
print("✅ count.txt file found")
return True
def verify_count_content(test_dir: Path) -> bool:
"""Verify that count.txt contains the correct number (83)."""
count_file = test_dir / "count.txt"
try:
content = count_file.read_text().strip()
# Check if content is exactly "83"
if content == "83":
print("✅ count.txt contains the correct number: 83")
return True
else:
print(f"❌ count.txt contains '{content}' but expected '83'")
return False
except Exception as e:
print(f"❌ Error reading count.txt: {e}")
return False
def verify_actual_html_count(test_dir: Path) -> bool:
"""Verify that there are actually 83 HTML files in the directory."""
html_files = list(test_dir.glob("*.html"))
count = len(html_files)
if count == 83:
print(f"✅ Verified: There are exactly {count} HTML files in the directory")
return True
else:
print(f"⚠️ Found {count} HTML files in the directory (expected 83)")
return False
def main():
"""Main verification function."""
try:
test_dir = get_test_directory()
print(f"🔍 Verifying HTML file count in: {test_dir}")
# Define verification steps
verification_steps = [
("Count File Exists", verify_count_file_exists),
("Count Content", verify_count_content),
("Actual HTML Count", verify_actual_html_count),
]
# Run all verification steps
all_passed = True
for step_name, verify_func in verification_steps:
print(f"\n--- {step_name} ---")
if not verify_func(test_dir):
all_passed = False
# Final result
print("\n" + "="*50)
if all_passed:
print("✅ HTML file count is correct!")
print("🎉 Task verification: PASS")
sys.exit(0)
else:
print("❌ Task verification: FAIL")
sys.exit(1)
except Exception as e:
print(f"❌ Verification failed with error: {e}")
sys.exit(1)
if __name__ == "__main__":
main()
================================================
FILE: tasks/filesystem/easy/student_database/duplicate_name/description.md
================================================
Please help me identify any duplicate name among the 150 students. Do not use Python code. You only need to find **any one** duplicate name. Then generate a `namesake.txt` file recording the result in the following format, with only three lines. Note: when recording the name, replace underscores with spaces.
name: xxx
count: xxx
ids: xxx, xxx, ...
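For illustration only, a hypothetical record (placeholder name and ids, not the actual answer) would look like:
name: Jane Doe
count: 2
ids: 20100001, 20100002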
================================================
FILE: tasks/filesystem/easy/student_database/duplicate_name/meta.json
================================================
{
"task_id": "duplicate_name",
"task_name": "Duplicate Name",
"category_id": "student_database",
"category_name": "Student Database",
"description": "Search the 150 student folders for any repeated full name and document the name, count, and ids in namesake.txt.",
"author": "Lingjun Chen",
"created_at": "2025-11-15",
"difficulty": "L1",
"tags": [
"pattern analysis",
"data validation"
],
"mcp": [
"filesystem"
],
"meta_data": {
"stateType": "text",
"stateContent": "student_database/\n \u251c\u2500\u2500 20101250_Patricia_Jones/\n \u2502 \u251c\u2500\u2500 basic_info.txt\n \u2502 \u2514\u2500\u2500 recommendation_letter.txt\n \u251c\u2500\u2500 20101701_Isabella_Davis/\n \u2502 \u251c\u2500\u2500 basic_info.txt\n \u2502 \u2514\u2500\u2500 recommendation_letter.txt\n \u251c\u2500\u2500 20102572_Michael_Taylor/\n \u2502 \u251c\u2500\u2500 basic_info.txt\n \u2502 \u2514\u2500\u2500 recommendation_letter.txt\n \u251c\u2500\u2500 20104233_Robert_Lopez/\n \u2502 \u251c\u2500\u2500 basic_info.txt\n \u2502 \u2514\u2500\u2500 recommendation_letter.txt\n \u251c\u2500\u2500 20104498_Sarah_Brown/\n \u2502 \u251c\u2500\u2500 basic_info.txt\n \u2502 \u2514\u2500\u2500 recommendation_letter.txt\n \u251c\u2500\u2500 20104653_Sophia_Brown/\n \u2502 \u251c\u2500\u2500 basic_info.txt\n \u2502 \u2514\u2500\u2500 recommendation_letter.txt\n \u251c\u2500\u2500 20104675_Michael_Gonzalez/\n \u2502 \u251c\u2500\u2500 basic_info.txt\n \u2502 \u2514\u2500\u2500 recommendation_letter.txt\n \u251c\u2500\u2500 20104846_Christopher_Brown/\n \u2502 \u251c\u2500\u2500 basic_info.txt\n \u2502 \u2514\u2500\u2500 recommendation_letter.txt\n \u251c\u2500\u2500 20107487_Mia_Martin/\n \u2502 \u251c\u2500\u2500 basic_info.txt\n \u2502 \u2514\u2500\u2500 recommendation_letter.txt\n \u251c\u2500\u2500 20108742_Sarah_Brown/\n \u2502 \u251c\u2500\u2500 basic_info.txt\n \u2502 \u2514\u2500\u2500 recommendation_letter.txt\n \u251c\u2500\u2500 20109144_Emma_Thomas/\n \u2502 \u251c\u2500\u2500 basic_info.txt\n \u2502 \u2514\u2500\u2500 recommendation_letter.txt\n \u251c\u2500\u2500 20109803_Oliver_Hernandez/\n \u2502 \u251c\u2500\u2500 basic_info.txt\n \u2502 \u2514\u2500\u2500 recommendation_letter.txt\n \u251c\u2500\u2500 20111634_Isabella_Thomas/\n \u2502 \u251c\u2500\u2500 basic_info.txt\n \u2502 \u2514\u2500\u2500 recommendation_letter.txt\n \u251c\u2500\u2500 20112439_Christopher_Moore/\n \u2502 \u251c\u2500\u2500 basic_info.txt\n \u2502 \u2514\u2500\u2500 recommendation_letter.txt\n \u251c\u2500\u2500 20113368_William_Wilson/\n \u2502 \u251c\u2500\u2500 basic_info.txt\n \u2502 \u2514\u2500\u2500 recommendation_letter.txt\n \u251c\u2500\u2500 20113603_Robert_Rodriguez/\n \u2502 \u251c\u2500\u2500 basic_info.txt\n \u2502 \u2514\u2500\u2500 recommendation_letter.txt\n \u251c\u2500\u2500 20114397_Isabella_Martin/\n \u2502 \u251c\u2500\u2500 basic_info.txt\n \u2502 \u2514\u2500\u2500 recommendation_letter.txt\n \u251c\u2500\u2500 20114869_Ethan_Martin/\n \u2502 \u251c\u2500\u2500 basic_info.txt\n \u2502 \u2514\u2500\u2500 recommendation_letter.txt\n \u251c\u2500\u2500 20115252_Mason_Johnson/\n \u2502 \u251c\u2500\u2500 basic_info.txt\n \u2502 \u2514\u2500\u2500 recommendation_letter.txt\n \u251c\u2500\u2500 20115632_Elizabeth_Anderson/\n \u2502 \u251c\u2500\u2500 basic_info.txt\n \u2502 \u2514\u2500\u2500 recommendation_letter.txt\n \u251c\u2500\u2500 20115753_Charlotte_Johnson/\n \u2502 \u251c\u2500\u2500 basic_info.txt\n \u2502 \u2514\u2500\u2500 recommendation_letter.txt\n \u251c\u2500\u2500 20115924_Michael_Lopez/\n \u2502 \u251c\u2500\u2500 basic_info.txt\n \u2502 \u2514\u2500\u2500 recommendation_letter.txt\n \u251c\u2500\u2500 20116232_Olivia_Lopez/\n \u2502 \u251c\u2500\u2500 basic_info.txt\n \u2502 \u2514\u2500\u2500 recommendation_letter.txt\n \u251c\u2500\u2500 20119528_Thomas_Brown/\n \u2502 \u251c\u2500\u2500 basic_info.txt\n \u2502 \u2514\u2500\u2500 recommendation_letter.txt\n \u251c\u2500\u2500 20122427_Karen_Gonzalez/\n \u2502 \u251c\u2500\u2500 basic_info.txt\n 
\u2502 \u2514\u2500\u2500 recommendation_letter.txt\n \u251c\u2500\u2500 20122977_Evelyn_Miller/\n \u2502 \u251c\u2500\u2500 basic_info.txt\n \u2502 \u2514\u2500\u2500 recommendation_letter.txt\n \u251c\u2500\u2500 20123376_Joseph_Johnson/\n \u2502 \u251c\u2500\u2500 basic_info.txt\n \u2502 \u2514\u2500\u2500 recommendation_letter.txt\n \u251c\u2500\u2500 20125451_Barbara_Brown/\n \u2502 \u251c\u2500\u2500 basic_info.txt\n \u2502 \u2514\u2500\u2500 recommendation_letter.txt\n \u251c\u2500\u2500 20126203_Barbara_Davis/\n \u2502 \u251c\u2500\u2500 basic_info.txt\n \u2502 \u2514\u2500\u2500 recommendation_letter.txt\n \u251c\u2500\u2500 20126394_Olivia_Williams/\n \u2502 \u251c\u2500\u2500 basic_info.txt\n \u2502 \u2514\u2500\u2500 recommendation_letter.txt\n \u251c\u2500\u2500 20126471_Ethan_Taylor/\n \u2502 \u251c\u2500\u2500 basic_info.txt\n \u2502 \u2514\u2500\u2500 recommendation_letter.txt\n \u251c\u2500\u2500 20127423_John_Williams/\n \u2502 \u251c\u2500\u2500 basic_info.txt\n \u2502 \u2514\u2500\u2500 recommendation_letter.txt\n \u251c\u2500\u2500 20128249_Oliver_Smith/\n \u2502 \u251c\u2500\u2500 basic_info.txt\n \u2502 \u2514\u2500\u2500 recommendation_letter.txt\n \u251c\u2500\u2500 20128879_Christopher_Taylor/\n \u2502 \u251c\u2500\u2500 basic_info.txt\n \u2502 \u2514\u2500\u2500 recommendation_letter.txt\n \u251c\u2500\u2500 20129898_Jessica_Johnson/\n \u2502 \u251c\u2500\u2500 basic_info.txt\n \u2502 \u2514\u2500\u2500 recommendation_letter.txt\n \u251c\u2500\u2500 20131271_Olivia_Brown/\n \u2502 \u251c\u2500\u2500 basic_info.txt\n \u2502 \u2514\u2500\u2500 recommendation_letter.txt\n \u251c\u2500\u2500 20131518_Sophia_Smith/\n \u2502 \u251c\u2500\u2500 basic_info.txt\n \u2502 \u2514\u2500\u2500 recommendation_letter.txt\n \u251c\u2500\u2500 20132026_Isabella_Smith/\n \u2502 \u251c\u2500\u2500 basic_info.txt\n \u2502 \u2514\u2500\u2500 recommendation_letter.txt\n \u251c\u2500\u2500 20132370_James_Brown/\n \u2502 \u251c\u2500\u2500 basic_info.txt\n \u2502 \u2514\u2500\u2500 recommendation_letter.txt\n \u251c\u2500\u2500 20132669_Noah_Smith/\n \u2502 \u251c\u2500\u2500 basic_info.txt\n \u2502 \u2514\u2500\u2500 recommendation_letter.txt\n \u251c\u2500\u2500 20133527_Mason_Jackson/\n \u2502 \u251c\u2500\u2500 basic_info.txt\n \u2502 \u2514\u2500\u2500 recommendation_letter.txt\n \u251c\u2500\u2500 20133697_Isabella_Smith/\n \u2502 \u251c\u2500\u2500 basic_info.txt\n \u2502 \u2514\u2500\u2500 recommendation_letter.txt\n \u251c\u2500\u2500 20135821_Thomas_Wilson/\n \u2502 \u251c\u2500\u2500 basic_info.txt\n \u2502 \u2514\u2500\u2500 recommendation_letter.txt\n \u251c\u2500\u2500 20136681_Benjamin_Anderson/\n \u2502 \u251c\u2500\u2500 basic_info.txt\n \u2502 \u2514\u2500\u2500 recommendation_letter.txt\n \u251c\u2500\u2500 20136890_Benjamin_Brown/\n \u2502 \u251c\u2500\u2500 basic_info.txt\n \u2502 \u2514\u2500\u2500 recommendation_letter.txt\n \u251c\u2500\u2500 20137514_Lucas_Anderson/\n \u2502 \u251c\u2500\u2500 basic_info.txt\n \u2502 \u2514\u2500\u2500 recommendation_letter.txt\n \u251c\u2500\u2500 20139234_Harper_Martinez/\n \u2502 \u251c\u2500\u2500 basic_info.txt\n \u2502 \u2514\u2500\u2500 recommendation_letter.txt\n \u251c\u2500\u2500 20139637_Noah_Johnson/\n \u2502 \u251c\u2500\u2500 basic_info.txt\n \u2502 \u2514\u2500\u2500 recommendation_letter.txt\n \u251c\u2500\u2500 20139647_Patricia_Lopez/\n \u2502 \u251c\u2500\u2500 basic_info.txt\n \u2502 \u2514\u2500\u2500 recommendation_letter.txt\n \u251c\u2500\u2500 20141421_Linda_Gonzalez/\n \u2502 \u251c\u2500\u2500 
basic_info.txt\n \u2502 \u2514\u2500\u2500 recommendation_letter.txt\n \u251c\u2500\u2500 20142085_William_Anderson/\n \u2502 \u251c\u2500\u2500 basic_info.txt\n \u2502 \u2514\u2500\u2500 recommendation_letter.txt\n \u251c\u2500\u2500 20142383_Amelia_Brown/\n \u2502 \u251c\u2500\u2500 basic_info.txt\n \u2502 \u2514\u2500\u2500 recommendation_letter.txt\n \u251c\u2500\u2500 20143406_Susan_Martin/\n \u2502 \u251c\u2500\u2500 basic_info.txt\n \u2502 \u2514\u2500\u2500 recommendation_letter.txt\n \u251c\u2500\u2500 20143830_James_Garcia/\n \u2502 \u251c\u2500\u2500 basic_info.txt\n \u2502 \u2514\u2500\u2500 recommendation_letter.txt\n \u251c\u2500\u2500 20146035_Christopher_Garcia/\n \u2502 \u251c\u2500\u2500 basic_info.txt\n \u2502 \u2514\u2500\u2500 recommendation_letter.txt\n \u251c\u2500\u2500 20146277_William_Anderson/\n \u2502 \u251c\u2500\u2500 basic_info.txt\n \u2502 \u2514\u2500\u2500 recommendation_letter.txt\n \u251c\u2500\u2500 20146279_Christopher_Moore/\n \u2502 \u251c\u2500\u2500 basic_info.txt\n \u2502 \u2514\u2500\u2500 recommendation_letter.txt\n \u251c\u2500\u2500 20147301_James_Jones/\n \u2502 \u251c\u2500\u2500 basic_info.txt\n \u2502 \u2514\u2500\u2500 recommendation_letter.txt\n \u251c\u2500\u2500 20147789_James_Anderson/\n \u2502 \u251c\u2500\u2500 basic_info.txt\n \u2502 \u2514\u2500\u2500 recommendation_letter.txt\n \u251c\u2500\u2500 20148681_John_Hernandez/\n \u2502 \u251c\u2500\u2500 basic_info.txt\n \u2502 \u2514\u2500\u2500 recommendation_letter.txt\n \u251c\u2500\u2500 20148778_Susan_Anderson/\n \u2502 \u251c\u2500\u2500 basic_info.txt\n \u2502 \u2514\u2500\u2500 recommendation_letter.txt\n \u251c\u2500\u2500 20149712_Jessica_Rodriguez/\n \u2502 \u251c\u2500\u2500 basic_info.txt\n \u2502 \u2514\u2500\u2500 recommendation_letter.txt\n \u251c\u2500\u2500 20151012_Harper_Miller/\n \u2502 \u251c\u2500\u2500 basic_info.txt\n \u2502 \u2514\u2500\u2500 recommendation_letter.txt\n \u251c\u2500\u2500 20153174_Benjamin_Jackson/\n \u2502 \u251c\u2500\u2500 basic_info.txt\n \u2502 \u2514\u2500\u2500 recommendation_letter.txt\n \u251c\u2500\u2500 20153412_Charlotte_Martin/\n \u2502 \u251c\u2500\u2500 basic_info.txt\n \u2502 \u2514\u2500\u2500 recommendation_letter.txt\n \u251c\u2500\u2500 20153606_James_Anderson/\n \u2502 \u251c\u2500\u2500 basic_info.txt\n \u2502 \u2514\u2500\u2500 recommendation_letter.txt\n \u251c\u2500\u2500 20153687_Richard_Taylor/\n \u2502 \u251c\u2500\u2500 basic_info.txt\n \u2502 \u2514\u2500\u2500 recommendation_letter.txt\n \u251c\u2500\u2500 20154518_John_Gonzalez/\n \u2502 \u251c\u2500\u2500 basic_info.txt\n \u2502 \u2514\u2500\u2500 recommendation_letter.txt\n \u251c\u2500\u2500 20154710_Benjamin_Rodriguez/\n \u2502 \u251c\u2500\u2500 basic_info.txt\n \u2502 \u2514\u2500\u2500 recommendation_letter.txt\n \u251c\u2500\u2500 20156469_Jennifer_Hernandez/\n \u2502 \u251c\u2500\u2500 basic_info.txt\n \u2502 \u2514\u2500\u2500 recommendation_letter.txt\n \u251c\u2500\u2500 20156522_Jennifer_Martinez/\n \u2502 \u251c\u2500\u2500 basic_info.txt\n \u2502 \u2514\u2500\u2500 recommendation_letter.txt\n \u251c\u2500\u2500 20156851_Noah_Anderson/\n \u2502 \u251c\u2500\u2500 basic_info.txt\n \u2502 \u2514\u2500\u2500 recommendation_letter.txt\n \u251c\u2500\u2500 20157943_Harper_Williams/\n \u2502 \u251c\u2500\u2500 basic_info.txt\n \u2502 \u2514\u2500\u2500 recommendation_letter.txt\n \u251c\u2500\u2500 20158266_Sophia_Moore/\n \u2502 \u251c\u2500\u2500 basic_info.txt\n \u2502 \u2514\u2500\u2500 recommendation_letter.txt\n \u251c\u2500\u2500 
20158294_Sophia_Wilson/\n \u2502 \u251c\u2500\u2500 basic_info.txt\n \u2502 \u2514\u2500\u2500 recommendation_letter.txt\n \u251c\u2500\u2500 20158819_Sarah_Wilson/\n \u2502 \u251c\u2500\u2500 basic_info.txt\n \u2502 \u2514\u2500\u2500 recommendation_letter.txt\n \u251c\u2500\u2500 20159113_John_Rodriguez/\n \u2502 \u251c\u2500\u2500 basic_info.txt\n \u2502 \u2514\u2500\u2500 recommendation_letter.txt\n \u251c\u2500\u2500 20159695_James_Moore/\n \u2502 \u251c\u2500\u2500 basic_info.txt\n \u2502 \u2514\u2500\u2500 recommendation_letter.txt\n \u251c\u2500\u2500 20161279_William_Moore/\n \u2502 \u251c\u2500\u2500 basic_info.txt\n \u2502 \u2514\u2500\u2500 recommendation_letter.txt\n \u251c\u2500\u2500 20162253_Mason_Rodriguez/\n \u2502 \u251c\u2500\u2500 basic_info.txt\n \u2502 \u2514\u2500\u2500 recommendation_letter.txt\n \u251c\u2500\u2500 20162542_Mia_Anderson/\n \u2502 \u251c\u2500\u2500 basic_info.txt\n \u2502 \u2514\u2500\u2500 recommendation_letter.txt\n \u251c\u2500\u2500 20163356_Ava_Anderson/\n \u2502 \u251c\u2500\u2500 basic_info.txt\n \u2502 \u2514\u2500\u2500 recommendation_letter.txt\n \u251c\u2500\u2500 20164515_Patricia_Moore/\n \u2502 \u251c\u2500\u2500 basic_info.txt\n \u2502 \u2514\u2500\u2500 recommendation_letter.txt\n \u251c\u2500\u2500 20164801_Noah_Rodriguez/\n \u2502 \u251c\u2500\u2500 basic_info.txt\n \u2502 \u2514\u2500\u2500 recommendation_letter.txt\n \u251c\u2500\u2500 20165511_Mary_Gonzalez/\n \u2502 \u251c\u2500\u2500 basic_info.txt\n \u2502 \u2514\u2500\u2500 recommendation_letter.txt\n \u251c\u2500\u2500 20166436_Christopher_Jackson/\n \u2502 \u251c\u2500\u2500 basic_info.txt\n \u2502 \u2514\u2500\u2500 recommendation_letter.txt\n \u251c\u2500\u2500 20166487_Barbara_Hernandez/\n \u2502 \u251c\u2500\u2500 basic_info.txt\n \u2502 \u2514\u2500\u2500 recommendation_letter.txt\n \u251c\u2500\u2500 20166564_Ava_Lopez/\n \u2502 \u251c\u2500\u2500 basic_info.txt\n \u2502 \u2514\u2500\u2500 recommendation_letter.txt\n \u251c\u2500\u2500 20166998_Ava_Lopez/\n \u2502 \u251c\u2500\u2500 basic_info.txt\n \u2502 \u2514\u2500\u2500 recommendation_letter.txt\n \u251c\u2500\u2500 20168311_Lucas_Jackson/\n \u2502 \u251c\u2500\u2500 basic_info.txt\n \u2502 \u2514\u2500\u2500 recommendation_letter.txt\n \u251c\u2500\u2500 20168491_Karen_Martinez/\n \u2502 \u251c\u2500\u2500 basic_info.txt\n \u2502 \u2514\u2500\u2500 recommendation_letter.txt\n \u251c\u2500\u2500 20169515_Thomas_Taylor/\n \u2502 \u251c\u2500\u2500 basic_info.txt\n \u2502 \u2514\u2500\u2500 recommendation_letter.txt\n \u251c\u2500\u2500 20171050_Christopher_Rodriguez/\n \u2502 \u251c\u2500\u2500 basic_info.txt\n \u2502 \u2514\u2500\u2500 recommendation_letter.txt\n \u251c\u2500\u2500 20171406_Mary_Anderson/\n \u2502 \u251c\u2500\u2500 basic_info.txt\n \u2502 \u2514\u2500\u2500 recommendation_letter.txt\n \u251c\u2500\u2500 20171613_Ethan_Moore/\n \u2502 \u251c\u2500\u2500 basic_info.txt\n \u2502 \u2514\u2500\u2500 recommendation_letter.txt\n \u251c\u2500\u2500 20172106_Isabella_Rodriguez/\n \u2502 \u251c\u2500\u2500 basic_info.txt\n \u2502 \u2514\u2500\u2500 recommendation_letter.txt\n \u251c\u2500\u2500 20173259_Michael_Anderson/\n \u2502 \u251c\u2500\u2500 basic_info.txt\n \u2502 \u2514\u2500\u2500 recommendation_letter.txt\n \u251c\u2500\u2500 20173492_Richard_Miller/\n \u2502 \u251c\u2500\u2500 basic_info.txt\n \u2502 \u2514\u2500\u2500 recommendation_letter.txt\n \u251c\u2500\u2500 20173501_Mary_Smith/\n \u2502 \u251c\u2500\u2500 basic_info.txt\n \u2502 \u2514\u2500\u2500 recommendation_letter.txt\n 
\u251c\u2500\u2500 20173517_Susan_Anderson/\n \u2502 \u251c\u2500\u2500 basic_info.txt\n \u2502 \u2514\u2500\u2500 recommendation_letter.txt\n \u251c\u2500\u2500 20174207_Richard_Wilson/\n \u2502 \u251c\u2500\u2500 basic_info.txt\n \u2502 \u2514\u2500\u2500 recommendation_letter.txt\n \u251c\u2500\u2500 20174369_Mary_Garcia/\n \u2502 \u251c\u2500\u2500 basic_info.txt\n \u2502 \u2514\u2500\u2500 recommendation_letter.txt\n \u251c\u2500\u2500 20175314_William_Taylor/\n \u2502 \u251c\u2500\u2500 basic_info.txt\n \u2502 \u2514\u2500\u2500 recommendation_letter.txt\n \u251c\u2500\u2500 20176169_Lucas_Smith/\n \u2502 \u251c\u2500\u2500 basic_info.txt\n \u2502 \u2514\u2500\u2500 recommendation_letter.txt\n \u251c\u2500\u2500 20176947_Noah_Miller/\n \u2502 \u251c\u2500\u2500 basic_info.txt\n \u2502 \u2514\u2500\u2500 recommendation_letter.txt\n \u251c\u2500\u2500 20177389_James_Smith/\n \u2502 \u251c\u2500\u2500 basic_info.txt\n \u2502 \u2514\u2500\u2500 recommendation_letter.txt\n \u251c\u2500\u2500 20178687_Isabella_Anderson/\n \u2502 \u251c\u2500\u2500 basic_info.txt\n \u2502 \u2514\u2500\u2500 recommendation_letter.txt\n \u251c\u2500\u2500 20179461_William_Johnson/\n \u2502 \u251c\u2500\u2500 basic_info.txt\n \u2502 \u2514\u2500\u2500 recommendation_letter.txt\n \u251c\u2500\u2500 20179690_Linda_Thomas/\n \u2502 \u251c\u2500\u2500 basic_info.txt\n \u2502 \u2514\u2500\u2500 recommendation_letter.txt\n \u251c\u2500\u2500 20181056_Sarah_Hernandez/\n \u2502 \u251c\u2500\u2500 basic_info.txt\n \u2502 \u2514\u2500\u2500 recommendation_letter.txt\n \u251c\u2500\u2500 20182020_Patricia_Taylor/\n \u2502 \u251c\u2500\u2500 basic_info.txt\n \u2502 \u2514\u2500\u2500 recommendation_letter.txt\n \u251c\u2500\u2500 20182390_Ethan_Wilson/\n \u2502 \u251c\u2500\u2500 basic_info.txt\n \u2502 \u2514\u2500\u2500 recommendation_letter.txt\n \u251c\u2500\u2500 20183149_David_Smith/\n \u2502 \u251c\u2500\u2500 basic_info.txt\n \u2502 \u2514\u2500\u2500 recommendation_letter.txt\n \u251c\u2500\u2500 20183219_Charlotte_Williams/\n \u2502 \u251c\u2500\u2500 basic_info.txt\n \u2502 \u2514\u2500\u2500 recommendation_letter.txt\n \u251c\u2500\u2500 20184489_Jessica_Gonzalez/\n \u2502 \u251c\u2500\u2500 basic_info.txt\n \u2502 \u2514\u2500\u2500 recommendation_letter.txt\n \u251c\u2500\u2500 20186154_Charlotte_Smith/\n \u2502 \u251c\u2500\u2500 basic_info.txt\n \u2502 \u2514\u2500\u2500 recommendation_letter.txt\n \u251c\u2500\u2500 20186510_James_Thomas/\n \u2502 \u251c\u2500\u2500 basic_info.txt\n \u2502 \u2514\u2500\u2500 recommendation_letter.txt\n \u251c\u2500\u2500 20187107_David_Martinez/\n \u2502 \u251c\u2500\u2500 basic_info.txt\n \u2502 \u2514\u2500\u2500 recommendation_letter.txt\n \u251c\u2500\u2500 20187144_Mary_Jackson/\n \u2502 \u251c\u2500\u2500 basic_info.txt\n \u2502 \u2514\u2500\u2500 recommendation_letter.txt\n \u251c\u2500\u2500 20187892_Christopher_Taylor/\n \u2502 \u251c\u2500\u2500 basic_info.txt\n \u2502 \u2514\u2500\u2500 recommendation_letter.txt\n \u251c\u2500\u2500 20187921_Mary_Jones/\n \u2502 \u251c\u2500\u2500 basic_info.txt\n \u2502 \u2514\u2500\u2500 recommendation_letter.txt\n \u251c\u2500\u2500 20187967_Sarah_Davis/\n \u2502 \u251c\u2500\u2500 basic_info.txt\n \u2502 \u2514\u2500\u2500 recommendation_letter.txt\n \u251c\u2500\u2500 20188937_James_Moore/\n \u2502 \u251c\u2500\u2500 basic_info.txt\n \u2502 \u2514\u2500\u2500 recommendation_letter.txt\n \u251c\u2500\u2500 20189123_Mary_Martin/\n \u2502 \u251c\u2500\u2500 basic_info.txt\n \u2502 \u2514\u2500\u2500 
recommendation_letter.txt\n \u251c\u2500\u2500 20189192_Olivia_Jones/\n \u2502 \u251c\u2500\u2500 basic_info.txt\n \u2502 \u2514\u2500\u2500 recommendation_letter.txt\n \u251c\u2500\u2500 20189268_Emma_Williams/\n \u2502 \u251c\u2500\u2500 basic_info.txt\n \u2502 \u2514\u2500\u2500 recommendation_letter.txt\n \u251c\u2500\u2500 20189854_William_Taylor/\n \u2502 \u251c\u2500\u2500 basic_info.txt\n \u2502 \u2514\u2500\u2500 recommendation_letter.txt\n \u251c\u2500\u2500 20191265_Joseph_Lopez/\n \u2502 \u251c\u2500\u2500 basic_info.txt\n \u2502 \u2514\u2500\u2500 recommendation_letter.txt\n \u251c\u2500\u2500 20192725_Robert_Martinez/\n \u2502 \u251c\u2500\u2500 basic_info.txt\n \u2502 \u2514\u2500\u2500 recommendation_letter.txt\n \u251c\u2500\u2500 20194054_Michael_Jones/\n \u2502 \u251c\u2500\u2500 basic_info.txt\n \u2502 \u2514\u2500\u2500 recommendation_letter.txt\n \u251c\u2500\u2500 20194160_Benjamin_Jackson/\n \u2502 \u251c\u2500\u2500 basic_info.txt\n \u2502 \u2514\u2500\u2500 recommendation_letter.txt\n \u251c\u2500\u2500 20194164_Sarah_Jones/\n \u2502 \u251c\u2500\u2500 basic_info.txt\n \u2502 \u2514\u2500\u2500 recommendation_letter.txt\n \u251c\u2500\u2500 20194525_John_Taylor/\n \u2502 \u251c\u2500\u2500 basic_info.txt\n \u2502 \u2514\u2500\u2500 recommendation_letter.txt\n \u251c\u2500\u2500 20195164_Jennifer_Gonzalez/\n \u2502 \u251c\u2500\u2500 basic_info.txt\n \u2502 \u2514\u2500\u2500 recommendation_letter.txt\n \u251c\u2500\u2500 20195982_David_Jackson/\n \u2502 \u251c\u2500\u2500 basic_info.txt\n \u2502 \u2514\u2500\u2500 recommendation_letter.txt\n \u251c\u2500\u2500 20196776_William_Brown/\n \u2502 \u251c\u2500\u2500 basic_info.txt\n \u2502 \u2514\u2500\u2500 recommendation_letter.txt\n \u251c\u2500\u2500 20196896_Olivia_Jones/\n \u2502 \u251c\u2500\u2500 basic_info.txt\n \u2502 \u2514\u2500\u2500 recommendation_letter.txt\n \u251c\u2500\u2500 20196961_Joseph_Thomas/\n \u2502 \u251c\u2500\u2500 basic_info.txt\n \u2502 \u2514\u2500\u2500 recommendation_letter.txt\n \u251c\u2500\u2500 20196998_Ethan_Wilson/\n \u2502 \u251c\u2500\u2500 basic_info.txt\n \u2502 \u2514\u2500\u2500 recommendation_letter.txt\n \u251c\u2500\u2500 20198548_Evelyn_Moore/\n \u2502 \u251c\u2500\u2500 basic_info.txt\n \u2502 \u2514\u2500\u2500 recommendation_letter.txt\n \u251c\u2500\u2500 20199036_Benjamin_Hernandez/\n \u2502 \u251c\u2500\u2500 basic_info.txt\n \u2502 \u2514\u2500\u2500 recommendation_letter.txt\n \u251c\u2500\u2500 20199583_Mary_Brown/\n \u2502 \u251c\u2500\u2500 basic_info.txt\n \u2502 \u2514\u2500\u2500 recommendation_letter.txt\n \u251c\u2500\u2500 20199735_Mason_Johnson/\n \u2502 \u251c\u2500\u2500 basic_info.txt\n \u2502 \u2514\u2500\u2500 recommendation_letter.txt\n \u251c\u2500\u2500 20199872_Sophia_Jackson/\n \u2502 \u251c\u2500\u2500 basic_info.txt\n \u2502 \u2514\u2500\u2500 recommendation_letter.txt\n \u251c\u2500\u2500 20199980_James_Rodriguez/\n \u2502 \u251c\u2500\u2500 basic_info.txt\n \u2502 \u2514\u2500\u2500 recommendation_letter.txt\n \u251c\u2500\u2500 20201385_John_Taylor/\n \u2502 \u251c\u2500\u2500 basic_info.txt\n \u2502 \u2514\u2500\u2500 recommendation_letter.txt\n \u251c\u2500\u2500 20201800_John_Jones/\n \u2502 \u251c\u2500\u2500 basic_info.txt\n \u2502 \u2514\u2500\u2500 recommendation_letter.txt\n \u251c\u2500\u2500 20202548_Robert_Miller/\n \u2502 \u251c\u2500\u2500 basic_info.txt\n \u2502 \u2514\u2500\u2500 recommendation_letter.txt\n \u251c\u2500\u2500 20203855_Mia_Miller/\n \u2502 \u251c\u2500\u2500 basic_info.txt\n \u2502 \u2514\u2500\u2500 
recommendation_letter.txt\n \u2514\u2500\u2500 20204611_Sarah_Wilson/\n \u251c\u2500\u2500 basic_info.txt\n \u2514\u2500\u2500 recommendation_letter.txt",
"stateUrl": "https://storage.mcpmark.ai/filesystem/student_database.zip",
"stateOriginalUrl": null
}
}
================================================
FILE: tasks/filesystem/easy/student_database/duplicate_name/verify.py
================================================
#!/usr/bin/env python3
"""
Verification script for Student Database Task: Find Duplicate Names
Simplified version that only checks against expected results without folder validation
"""
import sys
from pathlib import Path
import os
def get_test_directory() -> Path:
"""Get the test directory from FILESYSTEM_TEST_DIR env var."""
test_root = os.environ.get("FILESYSTEM_TEST_DIR")
if not test_root:
raise ValueError("FILESYSTEM_TEST_DIR environment variable is required")
return Path(test_root)
def verify_namesake_file_exists(test_dir: Path) -> bool:
"""Verify that the namesake.txt file exists."""
namesake_file = test_dir / "namesake.txt"
if not namesake_file.exists():
print("❌ File 'namesake.txt' not found")
return False
print("✅ Namesake file found")
return True
def parse_namesake_file(test_dir: Path) -> dict:
"""Parse the namesake.txt file and return structured data."""
namesake_file = test_dir / "namesake.txt"
try:
content = namesake_file.read_text()
lines = content.strip().split('\n')
namesakes = {}
current_line = 0
while current_line < len(lines):
# Skip blank lines
if not lines[current_line].strip():
current_line += 1
continue
# Check if we have enough lines for a complete group
if current_line + 2 >= len(lines):
print(f"❌ Incomplete group at line {current_line + 1}")
return {}
# Parse group
name_line = lines[current_line].strip()
count_line = lines[current_line + 1].strip()
ids_line = lines[current_line + 2].strip()
# Extract name
if not name_line.startswith("name: "):
print(f"❌ Invalid name line format at line {current_line + 1}: {name_line}")
return {}
name = name_line.replace("name: ", "").strip()
# Extract count
if not count_line.startswith("count: "):
print(f"❌ Invalid count line format at line {current_line + 2}: {count_line}")
return {}
count_str = count_line.replace("count: ", "").strip()
try:
count = int(count_str)
except ValueError:
print(f"❌ Invalid count format: {count_str}")
return {}
# Extract IDs
if not ids_line.startswith("ids: "):
print(f"❌ Invalid ids line format at line {current_line + 3}: {ids_line}")
return {}
ids_str = ids_line.replace("ids: ", "").strip()
ids = [id.strip() for id in ids_str.split(",")]
namesakes[name] = {
'count': count,
'ids': ids
}
current_line += 4 # Skip to next group (after blank line)
return namesakes
except Exception as e:
print(f"❌ Error parsing namesake file: {e}")
return {}
def verify_against_expected_results(namesakes: dict) -> bool:
"""Verify that exactly 1 duplicate name is found and it is correct."""
# Expected duplicate names from answer.md (hardcoded)
expected_duplicates = {
'Isabella Smith': ['20132026', '20133697'],
'Ava Lopez': ['20166564', '20166998'],
'James Moore': ['20159695', '20188937'],
'William Taylor': ['20175314', '20189854'],
'Ethan Wilson': ['20182390', '20196998'],
'Christopher Taylor': ['20128879', '20187892'],
'William Anderson': ['20142085', '20146277'],
'James Anderson': ['20147789', '20153606'],
'Olivia Jones': ['20189192', '20196896'],
'Mason Johnson': ['20115252', '20199735'],
'Benjamin Jackson': ['20153174', '20194160'],
'John Taylor': ['20194525', '20201385'],
'Susan Anderson': ['20148778', '20173517'],
'Christopher Moore': ['20112439', '20146279'],
'Sarah Wilson': ['20158819', '20204611'],
'Sarah Brown': ['20104498', '20108742']
}
# Check if exactly 1 duplicate name is found
if len(namesakes) != 1:
print(f"❌ Expected exactly 1 duplicate name, but found {len(namesakes)}")
return False
print(f"✅ Found exactly 1 duplicate name (as required)")
# Check if the namesake in the file is actually a correct duplicate
for name, data in namesakes.items():
if name not in expected_duplicates:
print(f"❌ '{name}' is not a duplicate name (not in expected list)")
return False
expected_ids = set(expected_duplicates[name])
stated_ids = set(data['ids'])
if expected_ids != stated_ids:
print(f"❌ ID mismatch for '{name}':")
print(f" Expected: {sorted(expected_ids)}")
print(f" Stated: {sorted(stated_ids)}")
return False
# Verify count matches
if data['count'] != 2:
print(f"❌ Count mismatch for '{name}': expected 2, got {data['count']}")
return False
print("✅ The identified duplicate name is correct")
print("✅ All student IDs match expected results")
print("✅ Count is correct (2 for the duplicate name)")
return True
def main():
"""Main verification function."""
test_dir = get_test_directory()
print("🔍 Verifying Student Database Task: Find Duplicate Names...")
# Check if namesake file exists
print("\n--- File Existence Check ---")
if not verify_namesake_file_exists(test_dir):
print("\n❌ Basic verification failed, cannot proceed with content verification")
sys.exit(1)
# Parse the file and run content verification
print("\n--- Content Verification ---")
namesakes = parse_namesake_file(test_dir)
if not namesakes:
print("❌ Failed to parse namesake file")
sys.exit(1)
# Verify against expected results
print("\n--- Results Verification ---")
if not verify_against_expected_results(namesakes):
print("\n❌ Task verification: FAIL")
sys.exit(1)
# Final result
print("\n" + "="*50)
print("✅ Namesake identification completed correctly!")
print(f"🎉 Found 1 duplicate name (exactly 1 required)")
print("🎉 Task verification: PASS")
sys.exit(0)
if __name__ == "__main__":
main()
================================================
FILE: tasks/filesystem/easy/student_database/recommender_name/description.md
================================================
Please find the recommendation letter for Patricia Jones and identify who wrote it. Generate a `recommender.txt` file with only the author's name.
================================================
FILE: tasks/filesystem/easy/student_database/recommender_name/meta.json
================================================
{
"task_id": "recommender_name",
"task_name": "Recommender Name",
"category_id": "student_database",
"category_name": "Student Database",
"description": "Read Patricia Jones's recommendation letter to capture who signed it and store only that name in recommender.txt.",
"author": "Lingjun Chen",
"created_at": "2025-11-15",
"difficulty": "L1",
"tags": [
"data extraction",
"document search"
],
"mcp": [
"filesystem"
],
"meta_data": {
"stateType": "text",
"stateContent": "student_database/\n \u251c\u2500\u2500 20101250_Patricia_Jones/\n \u2502 \u251c\u2500\u2500 basic_info.txt\n \u2502 \u2514\u2500\u2500 recommendation_letter.txt\n \u251c\u2500\u2500 20101701_Isabella_Davis/\n \u2502 \u251c\u2500\u2500 basic_info.txt\n \u2502 \u2514\u2500\u2500 recommendation_letter.txt\n \u251c\u2500\u2500 20102572_Michael_Taylor/\n \u2502 \u251c\u2500\u2500 basic_info.txt\n \u2502 \u2514\u2500\u2500 recommendation_letter.txt\n \u251c\u2500\u2500 20104233_Robert_Lopez/\n \u2502 \u251c\u2500\u2500 basic_info.txt\n \u2502 \u2514\u2500\u2500 recommendation_letter.txt\n \u251c\u2500\u2500 20104498_Sarah_Brown/\n \u2502 \u251c\u2500\u2500 basic_info.txt\n \u2502 \u2514\u2500\u2500 recommendation_letter.txt\n \u251c\u2500\u2500 20104653_Sophia_Brown/\n \u2502 \u251c\u2500\u2500 basic_info.txt\n \u2502 \u2514\u2500\u2500 recommendation_letter.txt\n \u251c\u2500\u2500 20104675_Michael_Gonzalez/\n \u2502 \u251c\u2500\u2500 basic_info.txt\n \u2502 \u2514\u2500\u2500 recommendation_letter.txt\n \u251c\u2500\u2500 20104846_Christopher_Brown/\n \u2502 \u251c\u2500\u2500 basic_info.txt\n \u2502 \u2514\u2500\u2500 recommendation_letter.txt\n \u251c\u2500\u2500 20107487_Mia_Martin/\n \u2502 \u251c\u2500\u2500 basic_info.txt\n \u2502 \u2514\u2500\u2500 recommendation_letter.txt\n \u251c\u2500\u2500 20108742_Sarah_Brown/\n \u2502 \u251c\u2500\u2500 basic_info.txt\n \u2502 \u2514\u2500\u2500 recommendation_letter.txt\n \u251c\u2500\u2500 20109144_Emma_Thomas/\n \u2502 \u251c\u2500\u2500 basic_info.txt\n \u2502 \u2514\u2500\u2500 recommendation_letter.txt\n \u251c\u2500\u2500 20109803_Oliver_Hernandez/\n \u2502 \u251c\u2500\u2500 basic_info.txt\n \u2502 \u2514\u2500\u2500 recommendation_letter.txt\n \u251c\u2500\u2500 20111634_Isabella_Thomas/\n \u2502 \u251c\u2500\u2500 basic_info.txt\n \u2502 \u2514\u2500\u2500 recommendation_letter.txt\n \u251c\u2500\u2500 20112439_Christopher_Moore/\n \u2502 \u251c\u2500\u2500 basic_info.txt\n \u2502 \u2514\u2500\u2500 recommendation_letter.txt\n \u251c\u2500\u2500 20113368_William_Wilson/\n \u2502 \u251c\u2500\u2500 basic_info.txt\n \u2502 \u2514\u2500\u2500 recommendation_letter.txt\n \u251c\u2500\u2500 20113603_Robert_Rodriguez/\n \u2502 \u251c\u2500\u2500 basic_info.txt\n \u2502 \u2514\u2500\u2500 recommendation_letter.txt\n \u251c\u2500\u2500 20114397_Isabella_Martin/\n \u2502 \u251c\u2500\u2500 basic_info.txt\n \u2502 \u2514\u2500\u2500 recommendation_letter.txt\n \u251c\u2500\u2500 20114869_Ethan_Martin/\n \u2502 \u251c\u2500\u2500 basic_info.txt\n \u2502 \u2514\u2500\u2500 recommendation_letter.txt\n \u251c\u2500\u2500 20115252_Mason_Johnson/\n \u2502 \u251c\u2500\u2500 basic_info.txt\n \u2502 \u2514\u2500\u2500 recommendation_letter.txt\n \u251c\u2500\u2500 20115632_Elizabeth_Anderson/\n \u2502 \u251c\u2500\u2500 basic_info.txt\n \u2502 \u2514\u2500\u2500 recommendation_letter.txt\n \u251c\u2500\u2500 20115753_Charlotte_Johnson/\n \u2502 \u251c\u2500\u2500 basic_info.txt\n \u2502 \u2514\u2500\u2500 recommendation_letter.txt\n \u251c\u2500\u2500 20115924_Michael_Lopez/\n \u2502 \u251c\u2500\u2500 basic_info.txt\n \u2502 \u2514\u2500\u2500 recommendation_letter.txt\n \u251c\u2500\u2500 20116232_Olivia_Lopez/\n \u2502 \u251c\u2500\u2500 basic_info.txt\n \u2502 \u2514\u2500\u2500 recommendation_letter.txt\n \u251c\u2500\u2500 20119528_Thomas_Brown/\n \u2502 \u251c\u2500\u2500 basic_info.txt\n \u2502 \u2514\u2500\u2500 recommendation_letter.txt\n \u251c\u2500\u2500 20122427_Karen_Gonzalez/\n \u2502 \u251c\u2500\u2500 basic_info.txt\n 
\u2502 \u2514\u2500\u2500 recommendation_letter.txt\n \u251c\u2500\u2500 20122977_Evelyn_Miller/\n \u2502 \u251c\u2500\u2500 basic_info.txt\n \u2502 \u2514\u2500\u2500 recommendation_letter.txt\n \u251c\u2500\u2500 20123376_Joseph_Johnson/\n \u2502 \u251c\u2500\u2500 basic_info.txt\n \u2502 \u2514\u2500\u2500 recommendation_letter.txt\n \u251c\u2500\u2500 20125451_Barbara_Brown/\n \u2502 \u251c\u2500\u2500 basic_info.txt\n \u2502 \u2514\u2500\u2500 recommendation_letter.txt\n \u251c\u2500\u2500 20126203_Barbara_Davis/\n \u2502 \u251c\u2500\u2500 basic_info.txt\n \u2502 \u2514\u2500\u2500 recommendation_letter.txt\n \u251c\u2500\u2500 20126394_Olivia_Williams/\n \u2502 \u251c\u2500\u2500 basic_info.txt\n \u2502 \u2514\u2500\u2500 recommendation_letter.txt\n \u251c\u2500\u2500 20126471_Ethan_Taylor/\n \u2502 \u251c\u2500\u2500 basic_info.txt\n \u2502 \u2514\u2500\u2500 recommendation_letter.txt\n \u251c\u2500\u2500 20127423_John_Williams/\n \u2502 \u251c\u2500\u2500 basic_info.txt\n \u2502 \u2514\u2500\u2500 recommendation_letter.txt\n \u251c\u2500\u2500 20128249_Oliver_Smith/\n \u2502 \u251c\u2500\u2500 basic_info.txt\n \u2502 \u2514\u2500\u2500 recommendation_letter.txt\n \u251c\u2500\u2500 20128879_Christopher_Taylor/\n \u2502 \u251c\u2500\u2500 basic_info.txt\n \u2502 \u2514\u2500\u2500 recommendation_letter.txt\n \u251c\u2500\u2500 20129898_Jessica_Johnson/\n \u2502 \u251c\u2500\u2500 basic_info.txt\n \u2502 \u2514\u2500\u2500 recommendation_letter.txt\n \u251c\u2500\u2500 20131271_Olivia_Brown/\n \u2502 \u251c\u2500\u2500 basic_info.txt\n \u2502 \u2514\u2500\u2500 recommendation_letter.txt\n \u251c\u2500\u2500 20131518_Sophia_Smith/\n \u2502 \u251c\u2500\u2500 basic_info.txt\n \u2502 \u2514\u2500\u2500 recommendation_letter.txt\n \u251c\u2500\u2500 20132026_Isabella_Smith/\n \u2502 \u251c\u2500\u2500 basic_info.txt\n \u2502 \u2514\u2500\u2500 recommendation_letter.txt\n \u251c\u2500\u2500 20132370_James_Brown/\n \u2502 \u251c\u2500\u2500 basic_info.txt\n \u2502 \u2514\u2500\u2500 recommendation_letter.txt\n \u251c\u2500\u2500 20132669_Noah_Smith/\n \u2502 \u251c\u2500\u2500 basic_info.txt\n \u2502 \u2514\u2500\u2500 recommendation_letter.txt\n \u251c\u2500\u2500 20133527_Mason_Jackson/\n \u2502 \u251c\u2500\u2500 basic_info.txt\n \u2502 \u2514\u2500\u2500 recommendation_letter.txt\n \u251c\u2500\u2500 20133697_Isabella_Smith/\n \u2502 \u251c\u2500\u2500 basic_info.txt\n \u2502 \u2514\u2500\u2500 recommendation_letter.txt\n \u251c\u2500\u2500 20135821_Thomas_Wilson/\n \u2502 \u251c\u2500\u2500 basic_info.txt\n \u2502 \u2514\u2500\u2500 recommendation_letter.txt\n \u251c\u2500\u2500 20136681_Benjamin_Anderson/\n \u2502 \u251c\u2500\u2500 basic_info.txt\n \u2502 \u2514\u2500\u2500 recommendation_letter.txt\n \u251c\u2500\u2500 20136890_Benjamin_Brown/\n \u2502 \u251c\u2500\u2500 basic_info.txt\n \u2502 \u2514\u2500\u2500 recommendation_letter.txt\n \u251c\u2500\u2500 20137514_Lucas_Anderson/\n \u2502 \u251c\u2500\u2500 basic_info.txt\n \u2502 \u2514\u2500\u2500 recommendation_letter.txt\n \u251c\u2500\u2500 20139234_Harper_Martinez/\n \u2502 \u251c\u2500\u2500 basic_info.txt\n \u2502 \u2514\u2500\u2500 recommendation_letter.txt\n \u251c\u2500\u2500 20139637_Noah_Johnson/\n \u2502 \u251c\u2500\u2500 basic_info.txt\n \u2502 \u2514\u2500\u2500 recommendation_letter.txt\n \u251c\u2500\u2500 20139647_Patricia_Lopez/\n \u2502 \u251c\u2500\u2500 basic_info.txt\n \u2502 \u2514\u2500\u2500 recommendation_letter.txt\n \u251c\u2500\u2500 20141421_Linda_Gonzalez/\n \u2502 \u251c\u2500\u2500 
basic_info.txt\n \u2502 \u2514\u2500\u2500 recommendation_letter.txt\n \u251c\u2500\u2500 20142085_William_Anderson/\n \u2502 \u251c\u2500\u2500 basic_info.txt\n \u2502 \u2514\u2500\u2500 recommendation_letter.txt\n \u251c\u2500\u2500 20142383_Amelia_Brown/\n \u2502 \u251c\u2500\u2500 basic_info.txt\n \u2502 \u2514\u2500\u2500 recommendation_letter.txt\n \u251c\u2500\u2500 20143406_Susan_Martin/\n \u2502 \u251c\u2500\u2500 basic_info.txt\n \u2502 \u2514\u2500\u2500 recommendation_letter.txt\n \u251c\u2500\u2500 20143830_James_Garcia/\n \u2502 \u251c\u2500\u2500 basic_info.txt\n \u2502 \u2514\u2500\u2500 recommendation_letter.txt\n \u251c\u2500\u2500 20146035_Christopher_Garcia/\n \u2502 \u251c\u2500\u2500 basic_info.txt\n \u2502 \u2514\u2500\u2500 recommendation_letter.txt\n \u251c\u2500\u2500 20146277_William_Anderson/\n \u2502 \u251c\u2500\u2500 basic_info.txt\n \u2502 \u2514\u2500\u2500 recommendation_letter.txt\n \u251c\u2500\u2500 20146279_Christopher_Moore/\n \u2502 \u251c\u2500\u2500 basic_info.txt\n \u2502 \u2514\u2500\u2500 recommendation_letter.txt\n \u251c\u2500\u2500 20147301_James_Jones/\n \u2502 \u251c\u2500\u2500 basic_info.txt\n \u2502 \u2514\u2500\u2500 recommendation_letter.txt\n \u251c\u2500\u2500 20147789_James_Anderson/\n \u2502 \u251c\u2500\u2500 basic_info.txt\n \u2502 \u2514\u2500\u2500 recommendation_letter.txt\n \u251c\u2500\u2500 20148681_John_Hernandez/\n \u2502 \u251c\u2500\u2500 basic_info.txt\n \u2502 \u2514\u2500\u2500 recommendation_letter.txt\n \u251c\u2500\u2500 20148778_Susan_Anderson/\n \u2502 \u251c\u2500\u2500 basic_info.txt\n \u2502 \u2514\u2500\u2500 recommendation_letter.txt\n \u251c\u2500\u2500 20149712_Jessica_Rodriguez/\n \u2502 \u251c\u2500\u2500 basic_info.txt\n \u2502 \u2514\u2500\u2500 recommendation_letter.txt\n \u251c\u2500\u2500 20151012_Harper_Miller/\n \u2502 \u251c\u2500\u2500 basic_info.txt\n \u2502 \u2514\u2500\u2500 recommendation_letter.txt\n \u251c\u2500\u2500 20153174_Benjamin_Jackson/\n \u2502 \u251c\u2500\u2500 basic_info.txt\n \u2502 \u2514\u2500\u2500 recommendation_letter.txt\n \u251c\u2500\u2500 20153412_Charlotte_Martin/\n \u2502 \u251c\u2500\u2500 basic_info.txt\n \u2502 \u2514\u2500\u2500 recommendation_letter.txt\n \u251c\u2500\u2500 20153606_James_Anderson/\n \u2502 \u251c\u2500\u2500 basic_info.txt\n \u2502 \u2514\u2500\u2500 recommendation_letter.txt\n \u251c\u2500\u2500 20153687_Richard_Taylor/\n \u2502 \u251c\u2500\u2500 basic_info.txt\n \u2502 \u2514\u2500\u2500 recommendation_letter.txt\n \u251c\u2500\u2500 20154518_John_Gonzalez/\n \u2502 \u251c\u2500\u2500 basic_info.txt\n \u2502 \u2514\u2500\u2500 recommendation_letter.txt\n \u251c\u2500\u2500 20154710_Benjamin_Rodriguez/\n \u2502 \u251c\u2500\u2500 basic_info.txt\n \u2502 \u2514\u2500\u2500 recommendation_letter.txt\n \u251c\u2500\u2500 20156469_Jennifer_Hernandez/\n \u2502 \u251c\u2500\u2500 basic_info.txt\n \u2502 \u2514\u2500\u2500 recommendation_letter.txt\n \u251c\u2500\u2500 20156522_Jennifer_Martinez/\n \u2502 \u251c\u2500\u2500 basic_info.txt\n \u2502 \u2514\u2500\u2500 recommendation_letter.txt\n \u251c\u2500\u2500 20156851_Noah_Anderson/\n \u2502 \u251c\u2500\u2500 basic_info.txt\n \u2502 \u2514\u2500\u2500 recommendation_letter.txt\n \u251c\u2500\u2500 20157943_Harper_Williams/\n \u2502 \u251c\u2500\u2500 basic_info.txt\n \u2502 \u2514\u2500\u2500 recommendation_letter.txt\n \u251c\u2500\u2500 20158266_Sophia_Moore/\n \u2502 \u251c\u2500\u2500 basic_info.txt\n \u2502 \u2514\u2500\u2500 recommendation_letter.txt\n \u251c\u2500\u2500 
20158294_Sophia_Wilson/\n \u2502 \u251c\u2500\u2500 basic_info.txt\n \u2502 \u2514\u2500\u2500 recommendation_letter.txt\n \u251c\u2500\u2500 20158819_Sarah_Wilson/\n \u2502 \u251c\u2500\u2500 basic_info.txt\n \u2502 \u2514\u2500\u2500 recommendation_letter.txt\n \u251c\u2500\u2500 20159113_John_Rodriguez/\n \u2502 \u251c\u2500\u2500 basic_info.txt\n \u2502 \u2514\u2500\u2500 recommendation_letter.txt\n \u251c\u2500\u2500 20159695_James_Moore/\n \u2502 \u251c\u2500\u2500 basic_info.txt\n \u2502 \u2514\u2500\u2500 recommendation_letter.txt\n \u251c\u2500\u2500 20161279_William_Moore/\n \u2502 \u251c\u2500\u2500 basic_info.txt\n \u2502 \u2514\u2500\u2500 recommendation_letter.txt\n \u251c\u2500\u2500 20162253_Mason_Rodriguez/\n \u2502 \u251c\u2500\u2500 basic_info.txt\n \u2502 \u2514\u2500\u2500 recommendation_letter.txt\n \u251c\u2500\u2500 20162542_Mia_Anderson/\n \u2502 \u251c\u2500\u2500 basic_info.txt\n \u2502 \u2514\u2500\u2500 recommendation_letter.txt\n \u251c\u2500\u2500 20163356_Ava_Anderson/\n \u2502 \u251c\u2500\u2500 basic_info.txt\n \u2502 \u2514\u2500\u2500 recommendation_letter.txt\n \u251c\u2500\u2500 20164515_Patricia_Moore/\n \u2502 \u251c\u2500\u2500 basic_info.txt\n \u2502 \u2514\u2500\u2500 recommendation_letter.txt\n \u251c\u2500\u2500 20164801_Noah_Rodriguez/\n \u2502 \u251c\u2500\u2500 basic_info.txt\n \u2502 \u2514\u2500\u2500 recommendation_letter.txt\n \u251c\u2500\u2500 20165511_Mary_Gonzalez/\n \u2502 \u251c\u2500\u2500 basic_info.txt\n \u2502 \u2514\u2500\u2500 recommendation_letter.txt\n \u251c\u2500\u2500 20166436_Christopher_Jackson/\n \u2502 \u251c\u2500\u2500 basic_info.txt\n \u2502 \u2514\u2500\u2500 recommendation_letter.txt\n \u251c\u2500\u2500 20166487_Barbara_Hernandez/\n \u2502 \u251c\u2500\u2500 basic_info.txt\n \u2502 \u2514\u2500\u2500 recommendation_letter.txt\n \u251c\u2500\u2500 20166564_Ava_Lopez/\n \u2502 \u251c\u2500\u2500 basic_info.txt\n \u2502 \u2514\u2500\u2500 recommendation_letter.txt\n \u251c\u2500\u2500 20166998_Ava_Lopez/\n \u2502 \u251c\u2500\u2500 basic_info.txt\n \u2502 \u2514\u2500\u2500 recommendation_letter.txt\n \u251c\u2500\u2500 20168311_Lucas_Jackson/\n \u2502 \u251c\u2500\u2500 basic_info.txt\n \u2502 \u2514\u2500\u2500 recommendation_letter.txt\n \u251c\u2500\u2500 20168491_Karen_Martinez/\n \u2502 \u251c\u2500\u2500 basic_info.txt\n \u2502 \u2514\u2500\u2500 recommendation_letter.txt\n \u251c\u2500\u2500 20169515_Thomas_Taylor/\n \u2502 \u251c\u2500\u2500 basic_info.txt\n \u2502 \u2514\u2500\u2500 recommendation_letter.txt\n \u251c\u2500\u2500 20171050_Christopher_Rodriguez/\n \u2502 \u251c\u2500\u2500 basic_info.txt\n \u2502 \u2514\u2500\u2500 recommendation_letter.txt\n \u251c\u2500\u2500 20171406_Mary_Anderson/\n \u2502 \u251c\u2500\u2500 basic_info.txt\n \u2502 \u2514\u2500\u2500 recommendation_letter.txt\n \u251c\u2500\u2500 20171613_Ethan_Moore/\n \u2502 \u251c\u2500\u2500 basic_info.txt\n \u2502 \u2514\u2500\u2500 recommendation_letter.txt\n \u251c\u2500\u2500 20172106_Isabella_Rodriguez/\n \u2502 \u251c\u2500\u2500 basic_info.txt\n \u2502 \u2514\u2500\u2500 recommendation_letter.txt\n \u251c\u2500\u2500 20173259_Michael_Anderson/\n \u2502 \u251c\u2500\u2500 basic_info.txt\n \u2502 \u2514\u2500\u2500 recommendation_letter.txt\n \u251c\u2500\u2500 20173492_Richard_Miller/\n \u2502 \u251c\u2500\u2500 basic_info.txt\n \u2502 \u2514\u2500\u2500 recommendation_letter.txt\n \u251c\u2500\u2500 20173501_Mary_Smith/\n \u2502 \u251c\u2500\u2500 basic_info.txt\n \u2502 \u2514\u2500\u2500 recommendation_letter.txt\n 
\u251c\u2500\u2500 20173517_Susan_Anderson/\n \u2502 \u251c\u2500\u2500 basic_info.txt\n \u2502 \u2514\u2500\u2500 recommendation_letter.txt\n \u251c\u2500\u2500 20174207_Richard_Wilson/\n \u2502 \u251c\u2500\u2500 basic_info.txt\n \u2502 \u2514\u2500\u2500 recommendation_letter.txt\n \u251c\u2500\u2500 20174369_Mary_Garcia/\n \u2502 \u251c\u2500\u2500 basic_info.txt\n \u2502 \u2514\u2500\u2500 recommendation_letter.txt\n \u251c\u2500\u2500 20175314_William_Taylor/\n \u2502 \u251c\u2500\u2500 basic_info.txt\n \u2502 \u2514\u2500\u2500 recommendation_letter.txt\n \u251c\u2500\u2500 20176169_Lucas_Smith/\n \u2502 \u251c\u2500\u2500 basic_info.txt\n \u2502 \u2514\u2500\u2500 recommendation_letter.txt\n \u251c\u2500\u2500 20176947_Noah_Miller/\n \u2502 \u251c\u2500\u2500 basic_info.txt\n \u2502 \u2514\u2500\u2500 recommendation_letter.txt\n \u251c\u2500\u2500 20177389_James_Smith/\n \u2502 \u251c\u2500\u2500 basic_info.txt\n \u2502 \u2514\u2500\u2500 recommendation_letter.txt\n \u251c\u2500\u2500 20178687_Isabella_Anderson/\n \u2502 \u251c\u2500\u2500 basic_info.txt\n \u2502 \u2514\u2500\u2500 recommendation_letter.txt\n \u251c\u2500\u2500 20179461_William_Johnson/\n \u2502 \u251c\u2500\u2500 basic_info.txt\n \u2502 \u2514\u2500\u2500 recommendation_letter.txt\n \u251c\u2500\u2500 20179690_Linda_Thomas/\n \u2502 \u251c\u2500\u2500 basic_info.txt\n \u2502 \u2514\u2500\u2500 recommendation_letter.txt\n \u251c\u2500\u2500 20181056_Sarah_Hernandez/\n \u2502 \u251c\u2500\u2500 basic_info.txt\n \u2502 \u2514\u2500\u2500 recommendation_letter.txt\n \u251c\u2500\u2500 20182020_Patricia_Taylor/\n \u2502 \u251c\u2500\u2500 basic_info.txt\n \u2502 \u2514\u2500\u2500 recommendation_letter.txt\n \u251c\u2500\u2500 20182390_Ethan_Wilson/\n \u2502 \u251c\u2500\u2500 basic_info.txt\n \u2502 \u2514\u2500\u2500 recommendation_letter.txt\n \u251c\u2500\u2500 20183149_David_Smith/\n \u2502 \u251c\u2500\u2500 basic_info.txt\n \u2502 \u2514\u2500\u2500 recommendation_letter.txt\n \u251c\u2500\u2500 20183219_Charlotte_Williams/\n \u2502 \u251c\u2500\u2500 basic_info.txt\n \u2502 \u2514\u2500\u2500 recommendation_letter.txt\n \u251c\u2500\u2500 20184489_Jessica_Gonzalez/\n \u2502 \u251c\u2500\u2500 basic_info.txt\n \u2502 \u2514\u2500\u2500 recommendation_letter.txt\n \u251c\u2500\u2500 20186154_Charlotte_Smith/\n \u2502 \u251c\u2500\u2500 basic_info.txt\n \u2502 \u2514\u2500\u2500 recommendation_letter.txt\n \u251c\u2500\u2500 20186510_James_Thomas/\n \u2502 \u251c\u2500\u2500 basic_info.txt\n \u2502 \u2514\u2500\u2500 recommendation_letter.txt\n \u251c\u2500\u2500 20187107_David_Martinez/\n \u2502 \u251c\u2500\u2500 basic_info.txt\n \u2502 \u2514\u2500\u2500 recommendation_letter.txt\n \u251c\u2500\u2500 20187144_Mary_Jackson/\n \u2502 \u251c\u2500\u2500 basic_info.txt\n \u2502 \u2514\u2500\u2500 recommendation_letter.txt\n \u251c\u2500\u2500 20187892_Christopher_Taylor/\n \u2502 \u251c\u2500\u2500 basic_info.txt\n \u2502 \u2514\u2500\u2500 recommendation_letter.txt\n \u251c\u2500\u2500 20187921_Mary_Jones/\n \u2502 \u251c\u2500\u2500 basic_info.txt\n \u2502 \u2514\u2500\u2500 recommendation_letter.txt\n \u251c\u2500\u2500 20187967_Sarah_Davis/\n \u2502 \u251c\u2500\u2500 basic_info.txt\n \u2502 \u2514\u2500\u2500 recommendation_letter.txt\n \u251c\u2500\u2500 20188937_James_Moore/\n \u2502 \u251c\u2500\u2500 basic_info.txt\n \u2502 \u2514\u2500\u2500 recommendation_letter.txt\n \u251c\u2500\u2500 20189123_Mary_Martin/\n \u2502 \u251c\u2500\u2500 basic_info.txt\n \u2502 \u2514\u2500\u2500 
recommendation_letter.txt\n \u251c\u2500\u2500 20189192_Olivia_Jones/\n \u2502 \u251c\u2500\u2500 basic_info.txt\n \u2502 \u2514\u2500\u2500 recommendation_letter.txt\n \u251c\u2500\u2500 20189268_Emma_Williams/\n \u2502 \u251c\u2500\u2500 basic_info.txt\n \u2502 \u2514\u2500\u2500 recommendation_letter.txt\n \u251c\u2500\u2500 20189854_William_Taylor/\n \u2502 \u251c\u2500\u2500 basic_info.txt\n \u2502 \u2514\u2500\u2500 recommendation_letter.txt\n \u251c\u2500\u2500 20191265_Joseph_Lopez/\n \u2502 \u251c\u2500\u2500 basic_info.txt\n \u2502 \u2514\u2500\u2500 recommendation_letter.txt\n \u251c\u2500\u2500 20192725_Robert_Martinez/\n \u2502 \u251c\u2500\u2500 basic_info.txt\n \u2502 \u2514\u2500\u2500 recommendation_letter.txt\n \u251c\u2500\u2500 20194054_Michael_Jones/\n \u2502 \u251c\u2500\u2500 basic_info.txt\n \u2502 \u2514\u2500\u2500 recommendation_letter.txt\n \u251c\u2500\u2500 20194160_Benjamin_Jackson/\n \u2502 \u251c\u2500\u2500 basic_info.txt\n \u2502 \u2514\u2500\u2500 recommendation_letter.txt\n \u251c\u2500\u2500 20194164_Sarah_Jones/\n \u2502 \u251c\u2500\u2500 basic_info.txt\n \u2502 \u2514\u2500\u2500 recommendation_letter.txt\n \u251c\u2500\u2500 20194525_John_Taylor/\n \u2502 \u251c\u2500\u2500 basic_info.txt\n \u2502 \u2514\u2500\u2500 recommendation_letter.txt\n \u251c\u2500\u2500 20195164_Jennifer_Gonzalez/\n \u2502 \u251c\u2500\u2500 basic_info.txt\n \u2502 \u2514\u2500\u2500 recommendation_letter.txt\n \u251c\u2500\u2500 20195982_David_Jackson/\n \u2502 \u251c\u2500\u2500 basic_info.txt\n \u2502 \u2514\u2500\u2500 recommendation_letter.txt\n \u251c\u2500\u2500 20196776_William_Brown/\n \u2502 \u251c\u2500\u2500 basic_info.txt\n \u2502 \u2514\u2500\u2500 recommendation_letter.txt\n \u251c\u2500\u2500 20196896_Olivia_Jones/\n \u2502 \u251c\u2500\u2500 basic_info.txt\n \u2502 \u2514\u2500\u2500 recommendation_letter.txt\n \u251c\u2500\u2500 20196961_Joseph_Thomas/\n \u2502 \u251c\u2500\u2500 basic_info.txt\n \u2502 \u2514\u2500\u2500 recommendation_letter.txt\n \u251c\u2500\u2500 20196998_Ethan_Wilson/\n \u2502 \u251c\u2500\u2500 basic_info.txt\n \u2502 \u2514\u2500\u2500 recommendation_letter.txt\n \u251c\u2500\u2500 20198548_Evelyn_Moore/\n \u2502 \u251c\u2500\u2500 basic_info.txt\n \u2502 \u2514\u2500\u2500 recommendation_letter.txt\n \u251c\u2500\u2500 20199036_Benjamin_Hernandez/\n \u2502 \u251c\u2500\u2500 basic_info.txt\n \u2502 \u2514\u2500\u2500 recommendation_letter.txt\n \u251c\u2500\u2500 20199583_Mary_Brown/\n \u2502 \u251c\u2500\u2500 basic_info.txt\n \u2502 \u2514\u2500\u2500 recommendation_letter.txt\n \u251c\u2500\u2500 20199735_Mason_Johnson/\n \u2502 \u251c\u2500\u2500 basic_info.txt\n \u2502 \u2514\u2500\u2500 recommendation_letter.txt\n \u251c\u2500\u2500 20199872_Sophia_Jackson/\n \u2502 \u251c\u2500\u2500 basic_info.txt\n \u2502 \u2514\u2500\u2500 recommendation_letter.txt\n \u251c\u2500\u2500 20199980_James_Rodriguez/\n \u2502 \u251c\u2500\u2500 basic_info.txt\n \u2502 \u2514\u2500\u2500 recommendation_letter.txt\n \u251c\u2500\u2500 20201385_John_Taylor/\n \u2502 \u251c\u2500\u2500 basic_info.txt\n \u2502 \u2514\u2500\u2500 recommendation_letter.txt\n \u251c\u2500\u2500 20201800_John_Jones/\n \u2502 \u251c\u2500\u2500 basic_info.txt\n \u2502 \u2514\u2500\u2500 recommendation_letter.txt\n \u251c\u2500\u2500 20202548_Robert_Miller/\n \u2502 \u251c\u2500\u2500 basic_info.txt\n \u2502 \u2514\u2500\u2500 recommendation_letter.txt\n \u251c\u2500\u2500 20203855_Mia_Miller/\n \u2502 \u251c\u2500\u2500 basic_info.txt\n \u2502 \u2514\u2500\u2500 
recommendation_letter.txt\n \u2514\u2500\u2500 20204611_Sarah_Wilson/\n \u251c\u2500\u2500 basic_info.txt\n \u2514\u2500\u2500 recommendation_letter.txt",
"stateUrl": "https://storage.mcpmark.ai/filesystem/student_database.zip",
"stateOriginalUrl": null
}
}
================================================
FILE: tasks/filesystem/easy/student_database/recommender_name/verify.py
================================================
#!/usr/bin/env python3
"""
Verification script for Student Database Task: Find Recommender Name
"""
import sys
from pathlib import Path
import os
def get_test_directory() -> Path:
"""Get the test directory from FILESYSTEM_TEST_DIR env var."""
test_root = os.environ.get("FILESYSTEM_TEST_DIR")
if not test_root:
raise ValueError("FILESYSTEM_TEST_DIR environment variable is required")
return Path(test_root)
def verify_recommender_file_exists(test_dir: Path) -> bool:
"""Verify that the recommender.txt file exists."""
recommender_file = test_dir / "recommender.txt"
if not recommender_file.exists():
print("❌ File 'recommender.txt' not found")
return False
print("✅ Recommender file found")
return True
def verify_recommender_content(test_dir: Path) -> bool:
"""Verify that the recommender.txt file contains 'Brown'."""
recommender_file = test_dir / "recommender.txt"
try:
content = recommender_file.read_text()
if "Brown" in content:
print("✅ Recommender name 'Brown' found in file")
return True
else:
print("❌ Recommender name 'Brown' not found in file")
print(f" File content: {content.strip()}")
return False
except Exception as e:
print(f"❌ Error reading recommender file: {e}")
return False
def main():
"""Main verification function."""
test_dir = get_test_directory()
print("🔍 Verifying Student Database Task: Find Recommender Name...")
# Check if recommender file exists
print("\n--- File Existence Check ---")
if not verify_recommender_file_exists(test_dir):
print("\n❌ Basic verification failed, cannot proceed with content verification")
sys.exit(1)
# Verify content
print("\n--- Content Verification ---")
if not verify_recommender_content(test_dir):
print("\n❌ Task verification: FAIL")
sys.exit(1)
# Final result
print("\n" + "="*50)
print("✅ Recommender identification completed correctly!")
print("🎉 Task verification: PASS")
sys.exit(0)
if __name__ == "__main__":
main()
================================================
FILE: tasks/filesystem/standard/desktop/music_report/description.md
================================================
Please use FileSystem tools to complete the following task:
### 1. Data Loading
- Read and extract song information from `jay_chou/`
- Read and extract song information from `jj_lin/`
### 2. Popularity Score Calculation
For each song, calculate a popularity score using this formula (keep 3 decimal places):
```
popularity_score = (rating × 0.4) + (play_count_normalized × 0.4) + (year_factor × 0.2)
Where:
- rating: song rating (1-5 scale)
- play_count_normalized: play_count / 250 (0-1 scale)
- year_factor: (2025 - release_year) / 25 (recency bonus)
```
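As a rough illustration of the arithmetic (the values below are hypothetical, not taken from the source files), a song with rating 4.5, a play_count of 200, and a release_year of 2003 would score 2.296:
```
# Hypothetical example values (not read from jay_chou/ or jj_lin/)
rating = 4.5            # 1-5 scale
play_count = 200
release_year = 2003

play_count_normalized = play_count / 250        # 0.8
year_factor = (2025 - release_year) / 25        # 0.88

popularity_score = round(rating * 0.4 + play_count_normalized * 0.4 + year_factor * 0.2, 3)
print(popularity_score)  # 2.296
```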
### 3. Generate Analysis Report
Create a file named `music_analysis_report.txt` in the `music/` folder with the following exact format:
**Lines 1-20**: Each line contains one song in format `songname:popularity_score`
- Sort songs by popularity_score in descending order (highest first)
- Use exact song names as they appear in the source files
- Include all 20 songs from both artists
**Lines 21-25**: Top 5 song names only (one per line)
- List the top 5 songs by popularity_score
- No scores, just song names
- One song name per line
**Important**: The file must contain exactly 25 lines with no additional content, headers, or formatting.
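A minimal sketch of assembling the report, assuming a hypothetical `scores` dict that maps each of the 20 song names to its already-computed popularity_score:
```
# Hypothetical: scores maps song name -> popularity_score for all 20 songs
ranked = sorted(scores.items(), key=lambda kv: kv[1], reverse=True)

lines = [f"{name}:{score}" for name, score in ranked]   # lines 1-20
lines += [name for name, _ in ranked[:5]]               # lines 21-25

with open("music/music_analysis_report.txt", "w", encoding="utf-8") as f:
    f.write("\n".join(lines))
```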
================================================
FILE: tasks/filesystem/standard/desktop/music_report/meta.json
================================================
{
"task_id": "music_report",
"task_name": "Music Report",
"category_id": "desktop",
"category_name": "Desktop",
"description": "Search and analyze desktop music files to generate a scored recommendation list using specified computation rules and criteria.",
"author": "Lingjun Chen",
"created_at": "2025-08-12",
"difficulty": "L3",
"tags": [
"data extraction",
"pattern analysis"
],
"mcp": [
"filesystem"
],
"meta_data": {
"stateType": "text",
"stateContent": "desktop/\n ├── exp_logs/\n │ ├── aug/\n │ │ └── augmentation_log.txt\n │ ├── project_1/\n │ │ ├── data.csv\n │ │ ├── model.py\n │ │ └── README.md\n │ ├── project_2/\n │ │ ├── analysis_report.md\n │ │ └── data_analysis.py\n │ ├── sep/\n │ │ └── september_summary.csv\n │ ├── exp_record.md\n │ ├── experiment_summary.md\n │ └── results_record.csv\n ├── learning/\n │ ├── 2024/\n │ │ └── learning_progress.csv\n │ ├── 2025/\n │ │ └── learning_roadmap.md\n │ ├── activities/\n │ │ └── study_notes.py\n │ ├── research/\n │ │ └── research_topics.md\n │ ├── schedule/\n │ │ └── weekly_schedule.csv\n │ └── learning_goals.md\n ├── music/\n │ ├── beni/\n │ │ └── playlist_manager.py\n │ ├── jay_chou/\n │ │ └── favorite_songs.csv\n │ ├── jj_lin/\n │ │ └── top_songs.txt\n │ └── music_collection.md\n ├── old_homebrew/\n │ ├── 2023-09-23_22/\n │ │ ├── opt/\n │ │ └── Users/\n │ └── 2023-09-23_23/\n │ ├── opt/\n │ └── Users/\n ├── play/\n │ ├── game_plan/\n │ │ └── gaming_schedule.md\n │ ├── hongkong_tour/\n │ │ └── travel_itinerary.csv\n │ ├── kit&shoes_collection/\n │ │ └── inventory.py\n │ └── others/\n │ └── entertainment_planner.md\n └── travel_plan/\n ├── travel_bucket_list.md\n └── travel_calculator.py\n",
"stateUrl": "https://storage.mcpmark.ai/filesystem/desktop.zip",
"stateOriginalUrl": null
}
}
================================================
FILE: tasks/filesystem/standard/desktop/music_report/verify.py
================================================
#!/usr/bin/env python3
"""
Verification script for Desktop 2 Music Report Task: Music Collection Analysis
"""
import sys
from pathlib import Path
import os
def get_test_directory() -> Path:
"""Get the test directory from FILESYSTEM_TEST_DIR env var."""
test_root = os.environ.get("FILESYSTEM_TEST_DIR")
if not test_root:
raise ValueError("FILESYSTEM_TEST_DIR environment variable is required")
return Path(test_root)
# Hardcoded expected data from answer.json
EXPECTED_SONGS = [
{"song_name": "晴天", "popularity_score": 2.576},
{"song_name": "七里香", "popularity_score": 2.488},
{"song_name": "江南", "popularity_score": 2.488},
{"song_name": "夜曲", "popularity_score": 2.448},
{"song_name": "一千年以后", "popularity_score": 2.44},
{"song_name": "稻香", "popularity_score": 2.376},
{"song_name": "青花瓷", "popularity_score": 2.336},
{"song_name": "不为谁而作的歌", "popularity_score": 2.32},
{"song_name": "学不会", "popularity_score": 2.304},
{"song_name": "小酒窝", "popularity_score": 2.264},
{"song_name": "可惜没如果", "popularity_score": 2.248},
{"song_name": "修炼爱情", "popularity_score": 2.24},
{"song_name": "背对背拥抱", "popularity_score": 2.24},
{"song_name": "爱笑的眼睛", "popularity_score": 2.232},
{"song_name": "她说", "popularity_score": 2.216},
{"song_name": "简单爱", "popularity_score": 1.952},
{"song_name": "龙卷风", "popularity_score": 1.936},
{"song_name": "双截棍", "popularity_score": 1.92},
{"song_name": "可爱女人", "popularity_score": 1.912},
{"song_name": "星晴", "popularity_score": 1.896}
]
EXPECTED_TOP_5 = ["晴天", "七里香", "江南", "夜曲", "一千年以后"]
def verify_report_file_exists(test_dir: Path) -> bool:
"""Verify that the music_analysis_report.txt file exists."""
report_file = test_dir / "music" / "music_analysis_report.txt"
if not report_file.exists():
print("❌ 'music_analysis_report.txt' file not found in music/ folder")
return False
if not report_file.is_file():
print("❌ 'music_analysis_report.txt' exists but is not a file")
return False
print("✅ 'music_analysis_report.txt' file exists")
return True
def verify_file_content_structure(test_dir: Path) -> bool:
"""Verify that the file has exactly 25 lines."""
report_file = test_dir / "music" / "music_analysis_report.txt"
try:
content = report_file.read_text(encoding='utf-8')
lines = content.strip().split('\n')
if len(lines) != 25:
print(f"❌ File should have exactly 25 lines, but has {len(lines)}")
return False
print("✅ File has exactly 25 lines")
return True
except Exception as e:
print(f"❌ Error reading file content: {e}")
return False
def verify_song_ranking_format(test_dir: Path) -> bool:
"""Verify that lines 1-20 contain songs with scores in correct format."""
report_file = test_dir / "music" / "music_analysis_report.txt"
try:
content = report_file.read_text(encoding='utf-8')
lines = content.strip().split('\n')
# Check lines 1-20 (index 0-19)
for i in range(20):
line = lines[i].strip()
if not line:
print(f"❌ Line {i+1} is empty")
return False
# Check format: songname:popularity_score
if ':' not in line:
print(f"❌ Line {i+1} missing colon separator: '{line}'")
return False
parts = line.split(':', 1)
if len(parts) != 2:
print(f"❌ Line {i+1} has incorrect format: '{line}'")
return False
song_name, score_str = parts
if not song_name.strip():
print(f"❌ Line {i+1} has empty song name: '{line}'")
return False
try:
score = float(score_str.strip())
if score < 0 or score > 5:
print(f"❌ Line {i+1} has invalid score range: {score}")
return False
except ValueError:
print(f"❌ Line {i+1} has invalid score format: '{score_str}'")
return False
print("✅ Lines 1-20 have correct song:score format")
return True
except Exception as e:
print(f"❌ Error checking song ranking format: {e}")
return False
def verify_song_ranking_order_with_tolerance(test_dir: Path) -> bool:
"""Verify that songs are ranked by popularity score in descending order, allowing equal scores to be swapped."""
report_file = test_dir / "music" / "music_analysis_report.txt"
try:
content = report_file.read_text(encoding='utf-8')
lines = content.strip().split('\n')
scores = []
for i in range(20):
line = lines[i].strip()
parts = line.split(':', 1)
score = float(parts[1].strip())
scores.append(score)
# Check if scores are in descending order, allowing equal scores to be adjacent
for i in range(1, len(scores)):
if scores[i] > scores[i-1]:
print(f"❌ Scores not in descending order: {scores[i-1]} < {scores[i]} at line {i+1}")
return False
print("✅ Songs are ranked by popularity score in descending order (allowing equal scores)")
return True
except Exception as e:
print(f"❌ Error checking song ranking order: {e}")
return False
def verify_song_names_match_expected(test_dir: Path) -> bool:
"""Verify that all expected song names are present in the ranking."""
report_file = test_dir / "music" / "music_analysis_report.txt"
try:
content = report_file.read_text(encoding='utf-8')
lines = content.strip().split('\n')
found_songs = []
for i in range(20):
line = lines[i].strip()
song_name = line.split(':', 1)[0].strip()
found_songs.append(song_name)
# Check if all expected songs are present
missing_songs = []
for expected_song in EXPECTED_SONGS:
if expected_song["song_name"] not in found_songs:
missing_songs.append(expected_song["song_name"])
if missing_songs:
print(f"❌ Missing expected songs: {missing_songs}")
return False
print("✅ All expected song names are present")
return True
except Exception as e:
print(f"❌ Error checking song names: {e}")
return False
def verify_popularity_scores_match_expected(test_dir: Path) -> bool:
"""Verify that popularity scores match the expected values."""
report_file = test_dir / "music" / "music_analysis_report.txt"
try:
content = report_file.read_text(encoding='utf-8')
lines = content.strip().split('\n')
score_errors = []
for i in range(20):
line = lines[i].strip()
parts = line.split(':', 1)
song_name = parts[0].strip()
actual_score = float(parts[1].strip())
# Find expected score for this song
expected_score = None
for expected_song in EXPECTED_SONGS:
if expected_song["song_name"] == song_name:
expected_score = expected_song["popularity_score"]
break
if expected_score is not None:
# Allow small floating point precision differences
if abs(actual_score - expected_score) > 0.001:
score_errors.append(f"{song_name}: expected {expected_score}, got {actual_score}")
if score_errors:
print(f"❌ Score mismatches: {score_errors}")
return False
print("✅ All popularity scores match expected values")
return True
except Exception as e:
print(f"❌ Error checking popularity scores: {e}")
return False
def verify_top_5_songs(test_dir: Path) -> bool:
"""Verify that lines 21-25 contain the top 5 song names, allowing equal scores to be in different order."""
report_file = test_dir / "music" / "music_analysis_report.txt"
try:
content = report_file.read_text(encoding='utf-8')
lines = content.strip().split('\n')
# Check lines 21-25 (index 20-24)
found_top_5 = []
for i in range(5):
line_num = i + 21
line = lines[i + 20].strip() # Index 20-24 for lines 21-25
if not line:
print(f"❌ Line {line_num} is empty")
return False
if ':' in line:
print(f"❌ Line {line_num} should not contain colon: '{line}'")
return False
found_top_5.append(line)
# Check if all expected top 5 songs are present (order doesn't matter for equal scores)
missing_songs = []
for expected_song in EXPECTED_TOP_5:
if expected_song not in found_top_5:
missing_songs.append(expected_song)
if missing_songs:
print(f"❌ Missing expected top 5 songs: {missing_songs}")
return False
# Check if the order is valid (allowing equal scores to be swapped)
# Since 七里香 and 江南 both have score 2.488, they can be in either order
valid_orders = [
["晴天", "七里香", "江南", "夜曲", "一千年以后"], # Original order
["晴天", "江南", "七里香", "夜曲", "一千年以后"], # Swapped 七里香 and 江南
]
order_valid = False
for valid_order in valid_orders:
if found_top_5 == valid_order:
order_valid = True
break
if not order_valid:
print(f"❌ Top 5 songs order is invalid. Found: {found_top_5}")
print(f"Expected one of: {valid_orders}")
return False
print("✅ Lines 21-25 contain correct top 5 song names in valid order")
return True
except Exception as e:
print(f"❌ Error checking top 5 songs: {e}")
return False
def verify_no_extra_content(test_dir: Path) -> bool:
"""Verify that the file contains no extra content beyond the 25 lines."""
report_file = test_dir / "music" / "music_analysis_report.txt"
try:
content = report_file.read_text(encoding='utf-8')
lines = content.strip().split('\n')
if len(lines) != 25:
print(f"❌ File should have exactly 25 lines, but has {len(lines)}")
return False
print("✅ File contains exactly 25 lines with no extra content")
return True
except Exception as e:
print(f"❌ Error checking for extra content: {e}")
return False
def main():
"""Main verification function."""
test_dir = get_test_directory()
print("🔍 Verifying Desktop 2 Music Report Task: Music Collection Analysis...")
# Define verification steps
verification_steps = [
("Report File Exists", verify_report_file_exists),
("File Content Structure", verify_file_content_structure),
("Song Ranking Format", verify_song_ranking_format),
("Song Ranking Order", verify_song_ranking_order_with_tolerance),
("Song Names Match Expected", verify_song_names_match_expected),
("Popularity Scores Match Expected", verify_popularity_scores_match_expected),
("Top 5 Songs", verify_top_5_songs),
("No Extra Content", verify_no_extra_content),
]
# Run all verification steps
all_passed = True
for step_name, verify_func in verification_steps:
print(f"\n--- {step_name} ---")
if not verify_func(test_dir):
all_passed = False
# Final result
print("\n" + "="*50)
if all_passed:
print("✅ Music collection analysis completed correctly!")
print("🎉 Task verification: PASS")
sys.exit(0)
else:
print("❌ Task verification: FAIL")
sys.exit(1)
if __name__ == "__main__":
main()
================================================
FILE: tasks/filesystem/standard/desktop/project_management/description.md
================================================
Please use FileSystem tools to finish the following task:
1. **Create the main directory structure** in `desktop_2`:
- Create a new directory in the main directory called `organized_projects`
- Inside `organized_projects`, create 3 main subdirectories: `experiments`, `learning`, and `personal`
- Inside `experiments`, create 2 subdirectories: `ml_projects` and `data_analysis`
- Inside `learning`, create 2 subdirectories: `progress_tracking` and `resources`
- Inside `personal`, create 2 subdirectories: `entertainment` and `collections`
2. **Move all the Python files** to `experiments/ml_projects/`:
3. **Move all the CSV files** to `experiments/data_analysis/`:
4. **Move only learning-related markdown files** to `learning/resources/`:
5. **Move only entertainment planning-related markdown files** to `personal/entertainment/`:
6. **Move only music collection-related markdown files** to `personal/collections/`:
7. **Steps 4, 5, and 6 together should move all of the markdown files.**
8. **Create a project structure documentation file**:
- Create `project_structure.md` in the `organized_projects` directory
- Document the new organization with exact file counts for each subdirectory
- Include a summary of what types of files are in each directory
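As a rough illustration (not part of the task instructions), steps 1-3 could be scripted along the lines of this minimal pathlib sketch; it assumes the task root is the current working directory, and the markdown moves in steps 4-6 still require per-file classification:
```python
from pathlib import Path
import shutil

root = Path(".")                      # assumed task root
base = root / "organized_projects"

# Step 1: create the full directory structure.
for sub in ["experiments/ml_projects", "experiments/data_analysis",
            "learning/progress_tracking", "learning/resources",
            "personal/entertainment", "personal/collections"]:
    (base / sub).mkdir(parents=True, exist_ok=True)

# Steps 2-3: move every .py file and every .csv file into the new structure.
for pattern, target in [("*.py", base / "experiments" / "ml_projects"),
                        ("*.csv", base / "experiments" / "data_analysis")]:
    for src in list(root.rglob(pattern)):
        if base not in src.parents:   # skip files already inside organized_projects
            shutil.move(str(src), str(target / src.name))
```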
================================================
FILE: tasks/filesystem/standard/desktop/project_management/meta.json
================================================
{
"task_id": "project_management",
"task_name": "Project Management",
"category_id": "desktop",
"category_name": "Desktop",
"description": "Reorganize scattered desktop files into a structured project directory system based on content type, purpose, and file format analysis.",
"author": "Lingjun Chen",
"created_at": "2025-08-12",
"difficulty": "L3",
"tags": [
"file organization"
],
"mcp": [
"filesystem"
],
"meta_data": {
"stateType": "text",
"stateContent": "desktop/\n ├── exp_logs/\n │ ├── aug/\n │ │ └── augmentation_log.txt\n │ ├── project_1/\n │ │ ├── data.csv\n │ │ ├── model.py\n │ │ └── README.md\n │ ├── project_2/\n │ │ ├── analysis_report.md\n │ │ └── data_analysis.py\n │ ├── sep/\n │ │ └── september_summary.csv\n │ ├── exp_record.md\n │ ├── experiment_summary.md\n │ └── results_record.csv\n ├── learning/\n │ ├── 2024/\n │ │ └── learning_progress.csv\n │ ├── 2025/\n │ │ └── learning_roadmap.md\n │ ├── activities/\n │ │ └── study_notes.py\n │ ├── research/\n │ │ └── research_topics.md\n │ ├── schedule/\n │ │ └── weekly_schedule.csv\n │ └── learning_goals.md\n ├── music/\n │ ├── beni/\n │ │ └── playlist_manager.py\n │ ├── jay_chou/\n │ │ └── favorite_songs.csv\n │ ├── jj_lin/\n │ │ └── top_songs.txt\n │ └── music_collection.md\n ├── old_homebrew/\n │ ├── 2023-09-23_22/\n │ │ ├── opt/\n │ │ └── Users/\n │ └── 2023-09-23_23/\n │ ├── opt/\n │ └── Users/\n ├── play/\n │ ├── game_plan/\n │ │ └── gaming_schedule.md\n │ ├── hongkong_tour/\n │ │ └── travel_itinerary.csv\n │ ├── kit&shoes_collection/\n │ │ └── inventory.py\n │ └── others/\n │ └── entertainment_planner.md\n └── travel_plan/\n ├── travel_bucket_list.md\n └── travel_calculator.py\n",
"stateUrl": "https://storage.mcpmark.ai/filesystem/desktop.zip",
"stateOriginalUrl": null
}
}
================================================
FILE: tasks/filesystem/standard/desktop/project_management/verify.py
================================================
#!/usr/bin/env python3
"""
Verification script for Desktop 2 Project Management Task: File Reorganization
"""
import sys
from pathlib import Path
import os
def get_test_directory() -> Path:
"""Get the test directory from FILESYSTEM_TEST_DIR env var."""
test_root = os.environ.get("FILESYSTEM_TEST_DIR")
if not test_root:
raise ValueError("FILESYSTEM_TEST_DIR environment variable is required")
return Path(test_root)
def verify_organized_projects_directory_exists(test_dir: Path) -> bool:
"""Verify that the organized_projects directory exists."""
organized_dir = test_dir / "organized_projects"
if not organized_dir.exists():
print("❌ 'organized_projects' directory not found")
return False
if not organized_dir.is_dir():
print("❌ 'organized_projects' exists but is not a directory")
return False
print("✅ 'organized_projects' directory exists")
return True
def verify_directory_structure(test_dir: Path) -> bool:
"""Verify that all required subdirectories exist."""
organized_dir = test_dir / "organized_projects"
required_dirs = [
"experiments",
"experiments/ml_projects",
"experiments/data_analysis",
"learning",
"learning/progress_tracking",
"learning/resources",
"personal",
"personal/entertainment",
"personal/collections"
]
missing_dirs = []
for dir_path in required_dirs:
full_path = organized_dir / dir_path
if not full_path.exists():
missing_dirs.append(dir_path)
elif not full_path.is_dir():
missing_dirs.append(f"{dir_path} (not a directory)")
if missing_dirs:
print(f"❌ Missing or invalid directories: {missing_dirs}")
return False
print("✅ All required directory structure created correctly")
return True
def verify_python_files_in_ml_projects(test_dir: Path) -> bool:
"""Verify that all Python files are moved to experiments/ml_projects."""
organized_dir = test_dir / "organized_projects"
ml_projects_dir = organized_dir / "experiments" / "ml_projects"
expected_python_files = [
"study_notes.py",
"model.py",
"data_analysis.py",
"travel_calculator.py",
"inventory.py",
"playlist_manager.py"
]
missing_files = []
for filename in expected_python_files:
file_path = ml_projects_dir / filename
if not file_path.exists():
missing_files.append(filename)
if missing_files:
print(f"❌ Missing Python files in ml_projects: {missing_files}")
return False
print("✅ All Python files moved to experiments/ml_projects")
return True
def verify_csv_files_in_data_analysis(test_dir: Path) -> bool:
"""Verify that all CSV files are moved to experiments/data_analysis."""
organized_dir = test_dir / "organized_projects"
data_analysis_dir = organized_dir / "experiments" / "data_analysis"
expected_csv_files = [
"learning_progress.csv",
"weekly_schedule.csv",
"results_record.csv",
"september_summary.csv",
"data.csv",
"favorite_songs.csv",
"travel_itinerary.csv"
]
missing_files = []
for filename in expected_csv_files:
file_path = data_analysis_dir / filename
if not file_path.exists():
missing_files.append(filename)
if missing_files:
print(f"❌ Missing CSV files in data_analysis: {missing_files}")
return False
print("✅ All CSV files moved to experiments/data_analysis")
return True
def verify_learning_md_files_in_resources(test_dir: Path) -> bool:
"""Verify that learning-related markdown files are moved to learning/resources."""
organized_dir = test_dir / "organized_projects"
resources_dir = organized_dir / "learning" / "resources"
expected_learning_files = [
"learning_roadmap.md",
"research_topics.md",
"experiment_summary.md",
"exp_record.md",
"README.md",
"analysis_report.md",
"learning_goals.md"
]
missing_files = []
for filename in expected_learning_files:
file_path = resources_dir / filename
if not file_path.exists():
missing_files.append(filename)
if missing_files:
print(f"❌ Missing learning markdown files in resources: {missing_files}")
return False
print("✅ All learning markdown files moved to learning/resources")
return True
def verify_entertainment_md_files_in_entertainment(test_dir: Path) -> bool:
"""Verify that entertainment planning markdown files are moved to personal/entertainment."""
organized_dir = test_dir / "organized_projects"
entertainment_dir = organized_dir / "personal" / "entertainment"
expected_entertainment_files = [
"gaming_schedule.md",
"entertainment_planner.md",
"travel_bucket_list.md"
]
missing_files = []
for filename in expected_entertainment_files:
file_path = entertainment_dir / filename
if not file_path.exists():
missing_files.append(filename)
if missing_files:
print(f"❌ Missing entertainment markdown files in entertainment: {missing_files}")
return False
print("✅ All entertainment markdown files moved to personal/entertainment")
return True
def verify_music_md_files_in_collections(test_dir: Path) -> bool:
"""Verify that music collection markdown files are moved to personal/collections."""
organized_dir = test_dir / "organized_projects"
collections_dir = organized_dir / "personal" / "collections"
expected_music_files = [
"music_collection.md"
]
missing_files = []
for filename in expected_music_files:
file_path = collections_dir / filename
if not file_path.exists():
missing_files.append(filename)
if missing_files:
print(f"❌ Missing music collection markdown files in collections: {filename}")
return False
print("✅ All music collection markdown files moved to personal/collections")
return True
def verify_progress_tracking_empty(test_dir: Path) -> bool:
"""Verify that progress_tracking directory is empty."""
organized_dir = test_dir / "organized_projects"
progress_dir = organized_dir / "learning" / "progress_tracking"
files_in_progress = list(progress_dir.iterdir())
if files_in_progress:
print(f"❌ progress_tracking directory should be empty, but contains: {[f.name for f in files_in_progress]}")
return False
print("✅ progress_tracking directory is correctly empty")
return True
def verify_project_structure_file_exists(test_dir: Path) -> bool:
"""Verify that project_structure.md file exists."""
organized_dir = test_dir / "organized_projects"
structure_file = organized_dir / "project_structure.md"
if not structure_file.exists():
print("❌ 'project_structure.md' file not found")
return False
if not structure_file.is_file():
print("❌ 'project_structure.md' exists but is not a file")
return False
print("✅ 'project_structure.md' file exists")
return True
def verify_file_counts(test_dir: Path) -> bool:
"""Verify that each directory has the correct number of files."""
organized_dir = test_dir / "organized_projects"
expected_counts = {
"experiments/ml_projects": 6, # 6 Python files
"experiments/data_analysis": 7, # 7 CSV files
"learning/resources": 7, # 7 learning markdown files
"learning/progress_tracking": 0, # 0 files (empty)
"personal/entertainment": 3, # 3 entertainment markdown files
"personal/collections": 1 # 1 music collection markdown file
}
incorrect_counts = []
for dir_path, expected_count in expected_counts.items():
full_path = organized_dir / dir_path
actual_count = len([f for f in full_path.iterdir() if f.is_file()])
if actual_count != expected_count:
incorrect_counts.append(f"{dir_path}: expected {expected_count}, got {actual_count}")
if incorrect_counts:
print(f"❌ Incorrect file counts: {incorrect_counts}")
return False
print("✅ All directories have correct file counts")
return True
def main():
"""Main verification function."""
test_dir = get_test_directory()
print("🔍 Verifying Desktop 2 Project Management Task: File Reorganization...")
# Define verification steps
verification_steps = [
("Organized Projects Directory Exists", verify_organized_projects_directory_exists),
("Directory Structure", verify_directory_structure),
("Python Files in ML Projects", verify_python_files_in_ml_projects),
("CSV Files in Data Analysis", verify_csv_files_in_data_analysis),
("Learning Markdown Files in Resources", verify_learning_md_files_in_resources),
("Entertainment Markdown Files in Entertainment", verify_entertainment_md_files_in_entertainment),
("Music Collection Files in Collections", verify_music_md_files_in_collections),
("Progress Tracking Empty", verify_progress_tracking_empty),
("Project Structure File Exists", verify_project_structure_file_exists),
("File Counts", verify_file_counts),
]
# Run all verification steps
all_passed = True
for step_name, verify_func in verification_steps:
print(f"\n--- {step_name} ---")
if not verify_func(test_dir):
all_passed = False
# Final result
print("\n" + "="*50)
if all_passed:
print("✅ Desktop 2 project reorganization completed correctly!")
print("🎉 Task verification: PASS")
sys.exit(0)
else:
print("❌ Task verification: FAIL")
sys.exit(1)
if __name__ == "__main__":
main()
================================================
FILE: tasks/filesystem/standard/desktop/timeline_extraction/description.md
================================================
Please use FileSystem tools to finish the following task:
Read all the files under the current path, extract every time/plan entry that clearly indicates 2024, integrate them into a list, and create a file named `timeline.txt` in the main directory. Write the timeline into that file in the following format.
### Rules
- If a task only shows month without day, use the 1st day of that month
- If a task only shows year without month and day, skip it.
- If a file shows multiple tasks on the same date, count only once per date
### Output Format
- Each line format: `file_path:time`
- `file_path`: The file path where this time information appears (**relative to the current path**)
- `time`: The specific date in `YYYY-MM-DD` format; if it is a time period, write the start date
### Sorting Requirements
- Sort by chronological order
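For illustration only, a minimal sketch of the output step, assuming the 2024 entries have already been collected as (relative path, date) pairs; the two entries shown are placeholders:
```python
from pathlib import Path

# Hypothetical collected entries; a set enforces "count only once per date" per file.
entries = {
    ("learning/2024/learning_progress.csv", "2024-02-01"),
    ("exp_logs/exp_record.md", "2024-08-01"),
}

# ISO dates sort chronologically as plain strings.
lines = [f"{path}:{date}" for path, date in sorted(entries, key=lambda e: (e[1], e[0]))]
Path("timeline.txt").write_text("\n".join(lines) + "\n", encoding="utf-8")
```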
================================================
FILE: tasks/filesystem/standard/desktop/timeline_extraction/meta.json
================================================
{
"task_id": "timeline_extraction",
"task_name": "Timeline Extraction",
"category_id": "desktop",
"category_name": "Desktop",
"description": "Extract temporal event information from various desktop files and compile a comprehensive chronological timeline of activities and milestones.",
"author": "Lingjun Chen",
"created_at": "2025-08-12",
"difficulty": "L3",
"tags": [
"data extraction",
"pattern analysis"
],
"mcp": [
"filesystem"
],
"meta_data": {
"stateType": "text",
"stateContent": "desktop/\n ├── exp_logs/\n │ ├── aug/\n │ │ └── augmentation_log.txt\n │ ├── project_1/\n │ │ ├── data.csv\n │ │ ├── model.py\n │ │ └── README.md\n │ ├── project_2/\n │ │ ├── analysis_report.md\n │ │ └── data_analysis.py\n │ ├── sep/\n │ │ └── september_summary.csv\n │ ├── exp_record.md\n │ ├── experiment_summary.md\n │ └── results_record.csv\n ├── learning/\n │ ├── 2024/\n │ │ └── learning_progress.csv\n │ ├── 2025/\n │ │ └── learning_roadmap.md\n │ ├── activities/\n │ │ └── study_notes.py\n │ ├── research/\n │ │ └── research_topics.md\n │ ├── schedule/\n │ │ └── weekly_schedule.csv\n │ └── learning_goals.md\n ├── music/\n │ ├── beni/\n │ │ └── playlist_manager.py\n │ ├── jay_chou/\n │ │ └── favorite_songs.csv\n │ ├── jj_lin/\n │ │ └── top_songs.txt\n │ └── music_collection.md\n ├── old_homebrew/\n │ ├── 2023-09-23_22/\n │ │ ├── opt/\n │ │ └── Users/\n │ └── 2023-09-23_23/\n │ ├── opt/\n │ └── Users/\n ├── play/\n │ ├── game_plan/\n │ │ └── gaming_schedule.md\n │ ├── hongkong_tour/\n │ │ └── travel_itinerary.csv\n │ ├── kit&shoes_collection/\n │ │ └── inventory.py\n │ └── others/\n │ └── entertainment_planner.md\n └── travel_plan/\n ├── travel_bucket_list.md\n └── travel_calculator.py\n",
"stateUrl": "https://storage.mcpmark.ai/filesystem/desktop.zip",
"stateOriginalUrl": null
}
}
================================================
FILE: tasks/filesystem/standard/desktop/timeline_extraction/verify.py
================================================
#!/usr/bin/env python3
"""
Verification script for Desktop 2 Timeline Extraction Task
"""
import sys
from pathlib import Path
import os
import re
from datetime import datetime
from typing import List, Tuple, Set
def get_test_directory() -> Path:
"""Get the test directory from FILESYSTEM_TEST_DIR env var."""
test_root = os.environ.get("FILESYSTEM_TEST_DIR")
if not test_root:
raise ValueError("FILESYSTEM_TEST_DIR environment variable is required")
return Path(test_root)
def verify_timeline_file_exists(test_dir: Path) -> bool:
"""Verify that the timeline.txt file exists in the main directory."""
timeline_file = test_dir / "timeline.txt"
if not timeline_file.exists():
print("❌ 'timeline.txt' file not found in main directory")
return False
if not timeline_file.is_file():
print("❌ 'timeline.txt' exists but is not a file")
return False
print("✅ 'timeline.txt' file exists in main directory")
return True
def verify_timeline_file_readable(test_dir: Path) -> bool:
"""Verify that the timeline.txt file is readable."""
timeline_file = test_dir / "timeline.txt"
try:
content = timeline_file.read_text(encoding='utf-8')
if not content.strip():
print("❌ 'timeline.txt' file is empty")
return False
print("✅ 'timeline.txt' file is readable")
return True
except Exception as e:
print(f"❌ Error reading 'timeline.txt' file: {e}")
return False
def verify_line_count(test_dir: Path) -> bool:
"""Verify that the timeline.txt file has exactly 43 lines."""
timeline_file = test_dir / "timeline.txt"
try:
content = timeline_file.read_text(encoding='utf-8')
lines = [line.strip() for line in content.split('\n') if line.strip()]
if len(lines) != 43:
print(f"❌ Expected 43 lines, but found {len(lines)} lines")
return False
print(f"✅ File contains exactly {len(lines)} lines")
return True
except Exception as e:
print(f"❌ Error checking line count: {e}")
return False
def verify_line_format(test_dir: Path) -> bool:
"""Verify that each line contains both file path and date time information."""
timeline_file = test_dir / "timeline.txt"
try:
content = timeline_file.read_text(encoding='utf-8')
lines = [line.strip() for line in content.split('\n') if line.strip()]
# More flexible pattern: just check if line contains both path-like content and date-like content
date_pattern = r'\d{4}-\d{2}-\d{2}' # YYYY-MM-DD format
invalid_lines = []
for i, line in enumerate(lines, 1):
# Check if line contains a date
if not re.search(date_pattern, line):
invalid_lines.append(f"Line {i}: '{line}' (no valid date found)")
continue
# Check if line contains path-like content (contains '/' or '.' and not just a date)
# More flexible: look for path anywhere in the line, not just at the beginning
path_found = False
# Split line into words and look for path-like content
words = line.split()
for word in words:
# Check if word looks like a file path (contains '/' or '.' and not just a date)
if ('/' in word or '.' in word) and not re.match(r'^\d{4}-\d{2}-\d{2}$', word.strip()):
path_found = True
break
# Also check if line contains path-like content with colon separator
if ':' in line:
parts = line.split(':')
for part in parts:
if ('/' in part or '.' in part) and not re.match(r'^\d{4}-\d{2}-\d{2}$', part.strip()):
path_found = True
break
if not path_found:
invalid_lines.append(f"Line {i}: '{line}' (no valid path found)")
continue
if invalid_lines:
print(f"❌ Invalid line format found: {invalid_lines[:5]}...")
return False
print("✅ All lines contain both file path and date time information")
return True
except Exception as e:
print(f"❌ Error checking line format: {e}")
return False
def verify_date_format(test_dir: Path) -> bool:
"""Verify that all dates are in valid YYYY-MM-DD format."""
timeline_file = test_dir / "timeline.txt"
try:
content = timeline_file.read_text(encoding='utf-8')
lines = [line.strip() for line in content.split('\n') if line.strip()]
invalid_dates = []
for i, line in enumerate(lines, 1):
try:
# Find date pattern in the line (more flexible)
date_match = re.search(r'\d{4}-\d{2}-\d{2}', line)
if not date_match:
invalid_dates.append(f"Line {i}: '{line}' (no date found)")
continue
date_part = date_match.group()
datetime.strptime(date_part, '%Y-%m-%d')
except (IndexError, ValueError) as e:
invalid_dates.append(f"Line {i}: '{line}' (invalid date: {e})")
if invalid_dates:
print(f"❌ Invalid date format found: {invalid_dates[:5]}...")
return False
print("✅ All dates are in valid YYYY-MM-DD format")
return True
except Exception as e:
print(f"❌ Error checking date format: {e}")
return False
def verify_chronological_order(test_dir: Path) -> bool:
"""Verify that dates are in chronological order."""
timeline_file = test_dir / "timeline.txt"
try:
content = timeline_file.read_text(encoding='utf-8')
lines = [line.strip() for line in content.split('\n') if line.strip()]
dates = []
for line in lines:
# Find date pattern in the line (more flexible)
date_match = re.search(r'\d{4}-\d{2}-\d{2}', line)
if date_match:
date_obj = datetime.strptime(date_match.group(), '%Y-%m-%d')
dates.append(date_obj)
# Check if dates are in ascending order
for i in range(1, len(dates)):
if dates[i] < dates[i-1]:
print(f"❌ Date order violation: {dates[i-1].strftime('%Y-%m-%d')} comes after {dates[i].strftime('%Y-%m-%d')}")
return False
print("✅ All dates are in chronological order")
return True
except Exception as e:
print(f"❌ Error checking chronological order: {e}")
return False
def verify_expected_entries(test_dir: Path) -> bool:
"""Verify that all expected entries from answer.txt are present."""
timeline_file = test_dir / "timeline.txt"
try:
content = timeline_file.read_text(encoding='utf-8')
actual_lines = [line.strip() for line in content.split('\n') if line.strip()]
# Expected entries from answer.txt
expected_entries = {
"exp_logs/project_2/analysis_report.md:2024-01-01",
"learning/2024/learning_progress.csv:2024-01-01",
"exp_logs/experiment_summary.md:2024-01-05",
"play/kit&shoes_collection/inventory.py:2024-01-05",
"exp_logs/experiment_summary.md:2024-01-10",
"play/kit&shoes_collection/inventory.py:2024-01-10",
"exp_logs/aug/augmentation_log.txt:2024-01-15",
"exp_logs/experiment_summary.md:2024-01-15",
"play/kit&shoes_collection/inventory.py:2024-01-15",
"learning/2024/learning_progress.csv:2024-02-01",
"learning/2024/learning_progress.csv:2024-03-01",
"play/hongkong_tour/travel_itinerary.csv:2024-03-15",
"travel_plan/travel_calculator.py:2024-03-15",
"play/hongkong_tour/travel_itinerary.csv:2024-03-16",
"play/hongkong_tour/travel_itinerary.csv:2024-03-17",
"play/hongkong_tour/travel_itinerary.csv:2024-03-18",
"play/hongkong_tour/travel_itinerary.csv:2024-03-19",
"play/hongkong_tour/travel_itinerary.csv:2024-03-20",
"travel_plan/travel_bucket_list.md:2024-04-01",
"learning/2024/learning_progress.csv:2024-04-01",
"learning/2024/learning_progress.csv:2024-05-01",
"travel_plan/travel_bucket_list.md:2024-06-01",
"learning/2024/learning_progress.csv:2024-06-01",
"learning/2024/learning_progress.csv:2024-07-01",
"exp_logs/exp_record.md:2024-08-01",
"exp_logs/results_record.csv:2024-08-01",
"travel_plan/travel_bucket_list.md:2024-08-01",
"learning/2024/learning_progress.csv:2024-08-01",
"exp_logs/results_record.csv:2024-08-02",
"exp_logs/results_record.csv:2024-08-03",
"exp_logs/results_record.csv:2024-08-04",
"exp_logs/exp_record.md:2024-09-01",
"exp_logs/sep/september_summary.csv:2024-09-01",
"learning/2024/learning_progress.csv:2024-09-01",
"exp_logs/sep/september_summary.csv:2024-09-05",
"exp_logs/sep/september_summary.csv:2024-09-10",
"exp_logs/sep/september_summary.csv:2024-09-15",
"exp_logs/sep/september_summary.csv:2024-09-20",
"exp_logs/sep/september_summary.csv:2024-09-25",
"exp_logs/sep/september_summary.csv:2024-09-30",
"learning/2024/learning_progress.csv:2024-10-01",
"learning/2024/learning_progress.csv:2024-11-01",
"learning/2024/learning_progress.csv:2024-12-01"
}
# Check if each expected entry is found in actual lines (more flexible matching)
missing_entries = []
for expected in expected_entries:
expected_path, expected_date = expected.split(':')
found = False
for actual_line in actual_lines:
# Check if line contains both the expected path and date
# More flexible: path can be anywhere in the line, not just at the beginning
if expected_path in actual_line and expected_date in actual_line:
found = True
break
if not found:
missing_entries.append(expected)
# Check for extra entries (lines that don't match any expected pattern)
extra_entries = []
for actual_line in actual_lines:
# Extract date from actual line
date_match = re.search(r'\d{4}-\d{2}-\d{2}', actual_line)
if not date_match:
continue
actual_date = date_match.group()
# Try to extract file path from the line
actual_path = None
words = actual_line.split()
for word in words:
if ('/' in word or '.' in word) and not re.match(r'^\d{4}-\d{2}-\d{2}$', word.strip()):
actual_path = word
break
if not actual_path:
continue
# Find if this line matches any expected entry
found_expected = False
for expected in expected_entries:
expected_path, expected_date = expected.split(':')
if expected_path in actual_path and expected_date == actual_date:
found_expected = True
break
if not found_expected:
extra_entries.append(actual_line)
if missing_entries:
print(f"❌ Missing {len(missing_entries)} expected entries")
print(f" Examples: {missing_entries[:3]}")
return False
if extra_entries:
print(f"❌ Found {len(extra_entries)} unexpected entries")
print(f" Examples: {extra_entries[:3]}")
return False
print("✅ All expected entries are present, no extra entries")
return True
except Exception as e:
print(f"❌ Error checking expected entries: {e}")
return False
def verify_no_duplicates(test_dir: Path) -> bool:
"""Verify that there are no duplicate entries."""
timeline_file = test_dir / "timeline.txt"
try:
content = timeline_file.read_text(encoding='utf-8')
lines = [line.strip() for line in content.split('\n') if line.strip()]
if len(lines) != len(set(lines)):
print("❌ Duplicate entries found in timeline.txt")
return False
print("✅ No duplicate entries found")
return True
except Exception as e:
print(f"❌ Error checking for duplicates: {e}")
return False
def verify_file_paths_exist(test_dir: Path) -> bool:
"""Verify that all file paths mentioned in timeline.txt actually exist."""
timeline_file = test_dir / "timeline.txt"
try:
content = timeline_file.read_text(encoding='utf-8')
lines = [line.strip() for line in content.split('\n') if line.strip()]
missing_files = []
for line in lines:
# Try to extract file path from the line (more flexible)
file_path_found = False
# Method 1: Split by colon and check each part
if ':' in line:
parts = line.split(':')
for part in parts:
part = part.strip()
if part and ('/' in part or '.' in part) and not re.match(r'^\d{4}-\d{2}-\d{2}$', part):
# This looks like a file path
full_path = test_dir / part
if not full_path.exists():
missing_files.append(part)
file_path_found = True
break
# Method 2: Split into words and look for path-like content
if not file_path_found:
words = line.split()
for word in words:
word = word.strip()
if ('/' in word or '.' in word) and not re.match(r'^\d{4}-\d{2}-\d{2}$', word):
# This looks like a file path
full_path = test_dir / word
if not full_path.exists():
missing_files.append(word)
file_path_found = True
break
# Method 3: Look for path pattern in the entire line
if not file_path_found:
# Use regex to find path-like patterns
path_pattern = r'[a-zA-Z0-9_\-\.\/]+/[a-zA-Z0-9_\-\.\/]+'
path_matches = re.findall(path_pattern, line)
for match in path_matches:
if '.' in match or '/' in match:
full_path = test_dir / match
if not full_path.exists():
missing_files.append(match)
file_path_found = True
break
if missing_files:
print(f"❌ {len(missing_files)} referenced files do not exist")
print(f" Examples: {missing_files[:3]}")
return False
print("✅ All referenced file paths exist")
return True
except Exception as e:
print(f"❌ Error checking file paths: {e}")
return False
def main():
"""Main verification function."""
test_dir = get_test_directory()
print("🔍 Verifying Desktop Timeline Extraction Task...")
# Define verification steps
verification_steps = [
("Timeline File Exists", verify_timeline_file_exists),
("File is Readable", verify_timeline_file_readable),
("Correct Line Count", verify_line_count),
("Line Format", verify_line_format),
("Date Format", verify_date_format),
("Chronological Order", verify_chronological_order),
("Expected Entries", verify_expected_entries),
("No Duplicates", verify_no_duplicates),
("File Paths Exist", verify_file_paths_exist),
]
# Run all verification steps
all_passed = True
for step_name, verify_func in verification_steps:
print(f"\n--- {step_name} ---")
if not verify_func(test_dir):
all_passed = False
# Final result
print("\n" + "="*50)
if all_passed:
print("✅ Desktop 2 Timeline Extraction completed correctly!")
print("🎉 Task verification: PASS")
sys.exit(0)
else:
print("❌ Task verification: FAIL")
sys.exit(1)
if __name__ == "__main__":
main()
================================================
FILE: tasks/filesystem/standard/desktop_template/budget_computation/description.md
================================================
Please use FileSystem tools to finish the following task:
### Task Description
You need to analyze all the files in the desktop environment to calculate personal life expenses and create a budget summary.
### Task Objectives
1. **Locate and analyze all files** in the desktop environment
2. **Extract personal life expenses** from the files (such as salary, food, living materials, tax, internet expenses, ...), excluding project/work expenses
3. **Create a file named `total_budget.txt`** in the main directory
4. **Format each expense entry** as `file_path;price` (one per line)
5. **Add total sum** as the last line, rounded to 2 decimal places
### Output Format
The `total_budget.txt` file should contain:
- One expense per line in format: `file_path;price`
- File path should be the relative path from the main directory
- Price should be rounded to 2 decimal places
- Last line should be the total sum
- No additional text or explanations
### Important Notes
- Only include personal life expenses (not in project/work)
- Use the cheapest available price when multiple options exist for one thing
- The total should match the sum of all individual expenses
- Hint: If a file contains one item for personal consumption, then every entry in that entire file is for personal consumption
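For illustration only, a minimal sketch of the output step, assuming the personal expenses have already been identified as (relative path, price) pairs; the two entries shown are placeholders:
```python
from pathlib import Path

# Hypothetical collected expenses: (relative file path, price) pairs.
expenses = [
    ("Documents/budget.csv", 250.00),
    ("Downloads/expenses.csv", 45.99),
]

lines = [f"{path};{price:.2f}" for path, price in expenses]
lines.append(f"{sum(price for _, price in expenses):.2f}")   # total as the last line
Path("total_budget.txt").write_text("\n".join(lines) + "\n", encoding="utf-8")
```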
================================================
FILE: tasks/filesystem/standard/desktop_template/budget_computation/meta.json
================================================
{
"task_id": "budget_computation",
"task_name": "Budget Computation",
"category_id": "desktop_template",
"category_name": "Desktop Template",
"description": "Analyze personal expense data extracted from desktop files to create a detailed budget summary report for financial review.",
"author": "Lingjun Chen",
"created_at": "2025-08-14",
"difficulty": "L3",
"tags": [
"data extraction",
"pattern analysis"
],
"mcp": [
"filesystem"
],
"meta_data": {
"stateType": "text",
"stateContent": "desktop_template/\n ├── Archives/\n │ ├── backup_contacts.csv\n │ └── tax_documents_2022.csv\n ├── Desktop/\n │ └── contacts.csv\n ├── Documents/\n │ ├── Personal/\n │ │ └── tax_info_2023.csv\n │ ├── Projects/\n │ │ └── budget_tracker.csv\n │ ├── Work/\n │ │ ├── client_list.csv\n │ │ └── timesheet.csv\n │ ├── budget.csv\n │ └── important_dates.csv\n ├── Downloads/\n │ ├── expenses.csv\n │ ├── fitness_log.csv\n │ └── price_comparisons.csv\n ├── Temp/\n │ └── test_data.csv\n ├── book_list.txt\n ├── bookmark_export.txt\n ├── calculations.txt\n ├── correspondence_2023.txt\n ├── draft_letter.txt\n ├── emergency_contacts.txt\n ├── example.txt\n └── experiment_results.txt\n",
"stateUrl": "https://storage.mcpmark.ai/filesystem/desktop_template.zip",
"stateOriginalUrl": null
}
}
================================================
FILE: tasks/filesystem/standard/desktop_template/budget_computation/verify.py
================================================
#!/usr/bin/env python3
"""
Verification script for Budget Computation Task
"""
import sys
from pathlib import Path
import os
from collections import Counter
def get_test_directory() -> Path:
"""Get the test directory from FILESYSTEM_TEST_DIR env var."""
test_root = os.environ.get("FILESYSTEM_TEST_DIR")
if not test_root:
raise ValueError("FILESYSTEM_TEST_DIR environment variable is required")
return Path(test_root)
def verify_total_budget_file_exists(test_dir: Path) -> bool:
"""Verify that the total_budget.txt file exists."""
budget_file = test_dir / "total_budget.txt"
if not budget_file.exists():
print("❌ File 'total_budget.txt' not found")
return False
print("✅ total_budget.txt file found")
return True
def verify_file_format(test_dir: Path) -> bool:
"""Verify that the total_budget.txt file has proper format."""
budget_file = test_dir / "total_budget.txt"
try:
content = budget_file.read_text()
lines = [line.strip() for line in content.split('\n') if line.strip()]
if len(lines) < 2:
print("❌ File must contain at least 2 lines (expenses + total)")
return False
# Check that all lines except the last follow the format file_path;price
for i, line in enumerate(lines[:-1]):
if ';' not in line:
print(f"❌ Line {i+1} does not contain ';' separator: {line}")
return False
parts = line.split(';')
if len(parts) != 2:
print(f"❌ Line {i+1} does not have exactly 2 parts: {line}")
return False
# Check if second part is a valid number
try:
float(parts[1])
except ValueError:
print(f"❌ Line {i+1} price is not a valid number: {parts[1]}")
return False
# Check if last line is a valid number (total)
try:
float(lines[-1])
except ValueError:
print(f"❌ Last line is not a valid number: {lines[-1]}")
return False
print("✅ File format is correct")
return True
except Exception as e:
print(f"❌ Error reading or parsing file: {e}")
return False
def verify_expense_entries(test_dir: Path) -> bool:
"""Verify that all 15 required expense entries are present."""
budget_file = test_dir / "total_budget.txt"
try:
content = budget_file.read_text()
lines = [line.strip() for line in content.split('\n') if line.strip()]
# Should have 16 lines total (15 expenses + 1 total)
if len(lines) != 16:
print(f"❌ Expected 16 lines (15 expenses + 1 total), found {len(lines)}")
return False
# Check that we have exactly 15 expense entries
expense_lines = lines[:-1] # All lines except the last
if len(expense_lines) != 15:
print(f"❌ Expected 15 expense entries, found {len(expense_lines)}")
return False
print("✅ File contains exactly 15 expense entries")
return True
except Exception as e:
print(f"❌ Error checking expense entries: {e}")
return False
def verify_file_paths_and_counts(test_dir: Path) -> bool:
"""Verify that all required file paths are present with correct counts."""
budget_file = test_dir / "total_budget.txt"
try:
content = budget_file.read_text()
lines = [line.strip() for line in content.split('\n') if line.strip()]
expense_lines = lines[:-1] # All lines except the last
# Extract file paths from expense lines
file_paths = []
for line in expense_lines:
file_path = line.split(';')[0]
file_paths.append(file_path)
# Count occurrences of each path
path_counts = Counter(file_paths)
# Expected file paths and their counts based on answer.txt
expected_paths = {
'Archives/tax_documents_2022.csv': 3,
'Documents/Personal/tax_info_2023.csv': 3,
'Documents/budget.csv': 3,
'Downloads/expenses.csv': 3,
'Downloads/price_comparisons.csv': 3
}
# Helper function to check if a path contains the expected path
def path_matches_expected(actual_path: str, expected_path: str) -> bool:
"""Check if actual path contains the expected path (allowing for prefixes like './')"""
# Remove common prefixes like './', '../', etc.
normalized_actual = actual_path
while normalized_actual.startswith('./') or normalized_actual.startswith('../'):
normalized_actual = normalized_actual[2:] if normalized_actual.startswith('./') else normalized_actual[3:]
# Check if the normalized path contains the expected path
return expected_path in normalized_actual or normalized_actual == expected_path
# Check if all expected paths are present with correct counts
for expected_path, expected_count in expected_paths.items():
# Find matching actual paths
matching_paths = []
for actual_path in path_counts.keys():
if path_matches_expected(actual_path, expected_path):
matching_paths.append(actual_path)
if not matching_paths:
print(f"❌ Missing expected file path: {expected_path}")
return False
# Sum up the counts from all matching paths
total_count = sum(path_counts[path] for path in matching_paths)
if total_count != expected_count:
print(f"❌ Path {expected_path} has wrong count: expected {expected_count}, found {total_count}")
print(f" Matching paths: {matching_paths}")
return False
# Check if there are any completely unexpected paths (not matching any expected path)
all_matching_paths = set()
for expected_path in expected_paths.keys():
for actual_path in path_counts.keys():
if path_matches_expected(actual_path, expected_path):
all_matching_paths.add(actual_path)
unexpected_paths = set(path_counts.keys()) - all_matching_paths
if unexpected_paths:
print(f"❌ Unexpected file paths found: {unexpected_paths}")
return False
print("✅ All expected file paths are present with correct counts")
return True
except Exception as e:
print(f"❌ Error checking file paths: {e}")
return False
def verify_individual_prices(test_dir: Path) -> bool:
"""Verify that all individual prices match the expected values."""
budget_file = test_dir / "total_budget.txt"
try:
content = budget_file.read_text()
lines = [line.strip() for line in content.split('\n') if line.strip()]
expense_lines = lines[:-1] # All lines except the last
# Expected prices based on answer.txt
expected_expenses = [
('Archives/tax_documents_2022.csv', 42000.00),
('Archives/tax_documents_2022.csv', 1800.00),
('Archives/tax_documents_2022.csv', 950.00),
('Documents/Personal/tax_info_2023.csv', 45000.00),
('Documents/Personal/tax_info_2023.csv', 2500.00),
('Documents/Personal/tax_info_2023.csv', 1200.00),
('Documents/budget.csv', 250.00),
('Documents/budget.csv', 180.00),
('Documents/budget.csv', 120.00),
('Downloads/expenses.csv', 45.99),
('Downloads/expenses.csv', 99.00),
('Downloads/expenses.csv', 234.50),
('Downloads/price_comparisons.csv', 879.99),
('Downloads/price_comparisons.csv', 289.99),
('Downloads/price_comparisons.csv', 74.99)
]
# Helper function to check if a path contains the expected path
def path_matches_expected(actual_path: str, expected_path: str) -> bool:
"""Check if actual path contains the expected path (allowing for prefixes like './')"""
# Remove common prefixes like './', '../', etc.
normalized_actual = actual_path
while normalized_actual.startswith('./') or normalized_actual.startswith('../'):
normalized_actual = normalized_actual[2:] if normalized_actual.startswith('./') else normalized_actual[3:]
# Check if the normalized path contains the expected path
return expected_path in normalized_actual or normalized_actual == expected_path
# Parse actual expenses
actual_expenses = []
for line in expense_lines:
parts = line.split(';')
file_path = parts[0]
price = float(parts[1])
actual_expenses.append((file_path, price))
# Create a counter for expected expenses to handle duplicates
expected_expenses_counter = Counter(expected_expenses)
actual_expenses_counter = Counter(actual_expenses)
# Check if all expected expenses are present with correct counts
for expected_expense, expected_count in expected_expenses_counter.items():
expected_path, expected_price = expected_expense
# Find matching actual expenses
matching_expenses = []
for actual_expense, actual_count in actual_expenses_counter.items():
actual_path, actual_price = actual_expense
if path_matches_expected(actual_path, expected_path) and abs(actual_price - expected_price) < 0.01:
matching_expenses.append(actual_expense)
if not matching_expenses:
print(f"❌ Missing expected expense: {expected_expense}")
return False
# Sum up the counts from all matching expenses
total_count = sum(actual_expenses_counter[expense] for expense in matching_expenses)
if total_count != expected_count:
print(f"❌ Expense {expected_expense} has wrong count: expected {expected_count}, found {total_count}")
print(f" Matching expenses: {matching_expenses}")
return False
# Check if there are any completely unexpected expenses (not matching any expected expense)
all_matching_expenses = set()
for expected_expense in expected_expenses_counter.keys():
expected_path, expected_price = expected_expense
for actual_expense in actual_expenses_counter.keys():
actual_path, actual_price = actual_expense
if path_matches_expected(actual_path, expected_path) and abs(actual_price - expected_price) < 0.01:
all_matching_expenses.add(actual_expense)
unexpected_expenses = set(actual_expenses_counter.keys()) - all_matching_expenses
if unexpected_expenses:
print(f"❌ Unexpected expenses found: {unexpected_expenses}")
return False
print("✅ All individual prices match expected values")
return True
except Exception as e:
print(f"❌ Error checking individual prices: {e}")
return False
def verify_total_price(test_dir: Path) -> bool:
"""Verify that the total price is correct."""
budget_file = test_dir / "total_budget.txt"
try:
content = budget_file.read_text()
lines = [line.strip() for line in content.split('\n') if line.strip()]
# Get the total from the last line
total_line = lines[-1]
try:
actual_total = float(total_line)
except ValueError:
print(f"❌ Last line is not a valid number: {total_line}")
return False
# Expected total based on answer.txt
expected_total = 95624.46
if abs(actual_total - expected_total) > 0.01: # Allow small floating point differences
print(f"❌ Expected total {expected_total}, found {actual_total}")
return False
print("✅ Total price is correct")
return True
except Exception as e:
print(f"❌ Error checking total price: {e}")
return False
def verify_total_calculation(test_dir: Path) -> bool:
"""Verify that the total matches the sum of individual expenses."""
budget_file = test_dir / "total_budget.txt"
try:
content = budget_file.read_text()
lines = [line.strip() for line in content.split('\n') if line.strip()]
expense_lines = lines[:-1] # All lines except the last
# Calculate sum of individual expenses
calculated_total = 0.0
for line in expense_lines:
price = float(line.split(';')[1])
calculated_total += price
# Get the stated total from the last line
stated_total = float(lines[-1])
# Check if they match (allow small floating point differences)
if abs(calculated_total - stated_total) > 0.01:
print(f"❌ Total calculation mismatch: calculated {calculated_total:.2f}, stated {stated_total:.2f}")
return False
print("✅ Total calculation is correct")
return True
except Exception as e:
print(f"❌ Error verifying total calculation: {e}")
return False
def main():
"""Main verification function."""
test_dir = get_test_directory()
print("🔍 Verifying Budget Computation Task...")
# Define verification steps
verification_steps = [
("Total Budget File Exists", verify_total_budget_file_exists),
("File Format", verify_file_format),
("Expense Entries Count", verify_expense_entries),
("File Paths and Counts", verify_file_paths_and_counts),
("Individual Prices", verify_individual_prices),
("Total Price", verify_total_price),
("Total Calculation", verify_total_calculation),
]
# Run all verification steps
all_passed = True
for step_name, verify_func in verification_steps:
print(f"\n--- {step_name} ---")
if not verify_func(test_dir):
all_passed = False
# Final result
print("\n" + "="*50)
if all_passed:
print("✅ Budget computation task completed successfully!")
print("🎉 All verification steps passed")
print("📊 Summary:")
print(" - 15 expense entries found")
print(" - 5 different file paths covered")
print(" - All individual prices correct")
print(" - Total price: $95,624.46")
print(" - Calculation verified")
sys.exit(0)
else:
print("❌ Budget computation task verification: FAIL")
print("Please check the errors above and ensure all requirements are met")
sys.exit(1)
if __name__ == "__main__":
main()
================================================
FILE: tasks/filesystem/standard/desktop_template/contact_information/description.md
================================================
Please use FileSystem tools to finish the following task:
### Task Description
Your task is to compile all contact information from all the files into a single CSV table. You need to extract all people's contact information and organize it systematically.
### Task Objectives
1. **Scan all files** in the directory
2. **Extract contact information** for all individuals and organizations found
3. **Create a CSV file** named `contact_info.csv` in the main directory
4. **Structure the CSV** with the following columns:
- First column: Name (required)
- Second column: Email (required)
- Third column: Phone (required)
- Additional columns: Any other contact information types found
5. **Consolidate information** by merging the same types of information across entries into single columns
6. **Leave cells blank** if specific information is not available for a person/organization
7. Entries from different files should be processed and listed separately, without any secondary processing.
### Expected Output
- **File name**: `contact_info.csv`
- **Format**: CSV with headers and data rows
### Reasoning Task
After creating the contact_info.csv file, analyze the data to answer:
**What is Charlie Davis's job/profession?**
Hint: focus on the contact information in contact_info.csv.
Write your answer in a file named `answer.txt` in the main directory.
### Important Notes
- Do not modify any existing files
- Only create the two new files: `contact_info.csv` and `answer.txt`
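For illustration only, a minimal sketch of writing the CSV with `csv.DictWriter`, which leaves cells blank when a field is missing; the rows and the extra column shown are placeholders:
```python
import csv

# Hypothetical extracted rows; any key beyond Name/Email/Phone becomes an extra column.
rows = [
    {"Name": "John Smith", "Email": "john@email.com", "Phone": "555-0101"},
    {"Name": "Acme Corp", "Email": "acme@corp.com", "Industry": "Technology"},
]

fieldnames = ["Name", "Email", "Phone", "Industry"]
with open("contact_info.csv", "w", newline="", encoding="utf-8") as f:
    writer = csv.DictWriter(f, fieldnames=fieldnames, restval="")  # restval="" leaves missing cells blank
    writer.writeheader()
    writer.writerows(rows)
```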
================================================
FILE: tasks/filesystem/standard/desktop_template/contact_information/meta.json
================================================
{
"task_id": "contact_information",
"task_name": "Contact Information",
"category_id": "desktop_template",
"category_name": "Desktop Template",
"description": "Extract contact details from various file formats on desktop and perform reasoning analysis on the collected relationship data.",
"author": "Lingjun Chen",
"created_at": "2025-08-14",
"difficulty": "L3",
"tags": [
"data extraction",
"cross-referencing"
],
"mcp": [
"filesystem"
],
"meta_data": {
"stateType": "text",
"stateContent": "desktop_template/\n ├── Archives/\n │ ├── backup_contacts.csv\n │ └── tax_documents_2022.csv\n ├── Desktop/\n │ └── contacts.csv\n ├── Documents/\n │ ├── Personal/\n │ │ └── tax_info_2023.csv\n │ ├── Projects/\n │ │ └── budget_tracker.csv\n │ ├── Work/\n │ │ ├── client_list.csv\n │ │ └── timesheet.csv\n │ ├── budget.csv\n │ └── important_dates.csv\n ├── Downloads/\n │ ├── expenses.csv\n │ ├── fitness_log.csv\n │ └── price_comparisons.csv\n ├── Temp/\n │ └── test_data.csv\n ├── book_list.txt\n ├── bookmark_export.txt\n ├── calculations.txt\n ├── correspondence_2023.txt\n ├── draft_letter.txt\n ├── emergency_contacts.txt\n ├── example.txt\n └── experiment_results.txt\n",
"stateUrl": "https://storage.mcpmark.ai/filesystem/desktop_template.zip",
"stateOriginalUrl": null
}
}
================================================
FILE: tasks/filesystem/standard/desktop_template/contact_information/verify.py
================================================
#!/usr/bin/env python3
"""
Verification script for Contact Information Compilation Task
"""
import sys
from pathlib import Path
import csv
import os
import re
def get_test_directory() -> Path:
"""Get the test directory from FILESYSTEM_TEST_DIR env var."""
test_root = os.environ.get("FILESYSTEM_TEST_DIR")
if not test_root:
raise ValueError("FILESYSTEM_TEST_DIR environment variable is required")
return Path(test_root)
def verify_contact_info_csv_exists(test_dir: Path) -> bool:
"""Verify that the contact_info.csv file exists in the main directory."""
contact_file = test_dir / "contact_info.csv"
if not contact_file.exists():
print("❌ File 'contact_info.csv' not found in main directory")
return False
print("✅ contact_info.csv file found")
return True
def verify_answer_txt_exists(test_dir: Path) -> bool:
"""Verify that the answer.txt file exists in the main directory."""
answer_file = test_dir / "answer.txt"
if not answer_file.exists():
print("❌ File 'answer.txt' not found in main directory")
return False
print("✅ answer.txt file found")
return True
def verify_csv_structure(test_dir: Path) -> bool:
"""Verify that the CSV file has the correct structure."""
contact_file = test_dir / "contact_info.csv"
try:
with open(contact_file, 'r', encoding='utf-8') as f:
reader = csv.reader(f)
rows = list(reader)
if len(rows) < 2: # Need at least header + 1 data row
print("❌ CSV file has insufficient rows")
return False
headers = rows[0]
if not headers:
print("❌ CSV file has no headers")
return False
# Check that Name is the first column
if headers[0].lower() != 'name':
print("❌ First column is not 'Name'")
return False
# Check that Email and Phone are present (order may vary)
header_lower = [h.lower() for h in headers]
if 'email' not in header_lower:
print("❌ 'Email' column not found")
return False
if 'phone' not in header_lower:
print("❌ 'Phone' column not found")
return False
print("✅ CSV structure is correct")
return True
except Exception as e:
print(f"❌ Error reading CSV file: {e}")
return False
def verify_csv_content_accuracy(test_dir: Path) -> bool:
"""Verify that the CSV content contains all required data, regardless of row order or extra entries."""
contact_file = test_dir / "contact_info.csv"
try:
with open(contact_file, 'r', encoding='utf-8') as f:
reader = csv.DictReader(f)
rows = list(reader)
# Expected data from answer.csv (hardcoded as required)
expected_data = [
{"Name": "John Smith", "Email": "john@email.com", "Phone": "555-0101", "Status": "", "Industry": ""},
{"Name": "Jane Doe", "Email": "jane@email.com", "Phone": "555-0102", "Status": "", "Industry": ""},
{"Name": "Bob Johnson", "Email": "bob@email.com", "Phone": "555-0103", "Status": "", "Industry": ""},
{"Name": "Alice Brown", "Email": "alice@email.com", "Phone": "555-0201", "Status": "Inactive", "Industry": ""},
{"Name": "Charlie Davis", "Email": "charlie@email.com", "Phone": "555-0202", "Status": "Active", "Industry": ""},
{"Name": "David Wilson", "Email": "david@email.com", "Phone": "555-0203", "Status": "Inactive", "Industry": ""},
{"Name": "Acme Corp", "Email": "acme@corp.com", "Phone": "", "Status": "", "Industry": "Technology"},
{"Name": "Global Inc", "Email": "global@inc.com", "Phone": "", "Status": "", "Industry": "Finance"},
{"Name": "Local Business", "Email": "local@biz.com", "Phone": "", "Status": "", "Industry": "Retail"},
{"Name": "Spouse", "Email": "", "Phone": "+1-555-0124", "Status": "", "Industry": ""},
{"Name": "Parent", "Email": "", "Phone": "+1-555-0125", "Status": "", "Industry": ""},
{"Name": "Sibling", "Email": "", "Phone": "+1-555-0126", "Status": "", "Industry": ""},
{"Name": "Primary Doctor", "Email": "", "Phone": "+1-555-0201", "Status": "", "Industry": ""},
{"Name": "Dentist", "Email": "", "Phone": "+1-555-0202", "Status": "", "Industry": ""},
{"Name": "Pharmacy", "Email": "", "Phone": "+1-555-0203", "Status": "", "Industry": ""}
]
# Convert expected data to a dictionary for easier lookup
# We'll use Name as the key since it should be unique
expected_dict = {}
for entry in expected_data:
expected_dict[entry["Name"]] = entry
# Check each row for accuracy, regardless of order
# Allow extra entries and mixed content
found_entries = set()
extra_entries = []
for i, row in enumerate(rows):
row_name = row.get('Name', '')
if not row_name:
# Skip rows without names (they're not valid entries)
continue
if row_name in expected_dict:
# This is one of our expected entries
if row_name in found_entries:
print(f"❌ Duplicate name found: '{row_name}'")
return False
found_entries.add(row_name)
expected = expected_dict[row_name]
# Check all columns for this entry
for key, expected_value in expected.items():
if key in row:
actual_value = row[key] if row[key] else ""
if actual_value != expected_value:
print(f"❌ Entry '{row_name}', column '{key}': expected '{expected_value}', got '{actual_value}'")
return False
else:
print(f"❌ Entry '{row_name}' missing column '{key}'")
return False
else:
# This is an extra entry - record it for informational purposes
extra_entries.append(row_name)
# Verify all expected entries were found
if len(found_entries) != len(expected_data):
missing = set(expected_dict.keys()) - found_entries
print(f"❌ Missing entries: {missing}")
return False
# Report extra entries if any
if extra_entries:
print(f"ℹ️ Found {len(extra_entries)} extra entries: {extra_entries}")
print(f"✅ CSV content accuracy verified: found all {len(expected_data)} required entries (plus {len(extra_entries)} extra entries)")
return True
except Exception as e:
print(f"❌ Error verifying CSV content: {e}")
return False
def verify_csv_data_completeness(test_dir: Path) -> bool:
"""Verify that all required data is present and no entries are missing."""
contact_file = test_dir / "contact_info.csv"
try:
with open(contact_file, 'r', encoding='utf-8') as f:
reader = csv.DictReader(f)
rows = list(reader)
# Check that all expected names are present
expected_names = [
"John Smith", "Jane Doe", "Bob Johnson", "Alice Brown",
"Charlie Davis", "David Wilson", "Acme Corp", "Global Inc",
"Local Business", "Spouse", "Parent", "Sibling",
"Primary Doctor", "Dentist", "Pharmacy"
]
actual_names = [row.get('Name', '') for row in rows if row.get('Name')]
missing_names = set(expected_names) - set(actual_names)
if missing_names:
print(f"❌ Missing names: {missing_names}")
return False
extra_names = set(actual_names) - set(expected_names)
if extra_names:
print(f"⚠️ Extra names found: {extra_names}")
# This is a warning, not an error
print("✅ CSV data completeness verified")
return True
except Exception as e:
print(f"❌ Error checking data completeness: {e}")
return False
def verify_answer_content(test_dir: Path) -> bool:
"""Verify that the answer.txt contains the correct answer about Charlie Davis."""
answer_file = test_dir / "answer.txt"
try:
content = answer_file.read_text().strip().lower()
# The answer should contain "dentist" (as per answer.txt)
if "dentist" in content:
print("✅ Answer about Charlie Davis's job is correct")
return True
else:
print(f"❌ Answer does not contain 'dentist'. Found: '{content}'")
return False
except Exception as e:
print(f"❌ Error reading answer.txt: {e}")
return False
def verify_file_locations(test_dir: Path) -> bool:
"""Verify that files are in the correct locations."""
contact_file = test_dir / "contact_info.csv"
answer_file = test_dir / "answer.txt"
# Check that files are in the main directory, not in subdirectories
if contact_file.parent != test_dir:
print(f"❌ contact_info.csv is not in main directory: {contact_file}")
return False
if answer_file.parent != test_dir:
print(f"❌ answer.txt is not in main directory: {answer_file}")
return False
print("✅ Files are in correct locations")
return True
def main():
"""Main verification function."""
test_dir = get_test_directory()
print("🔍 Verifying Contact Information Compilation Task...")
# Define verification steps
verification_steps = [
("Contact Info CSV Exists", verify_contact_info_csv_exists),
("Answer TXT Exists", verify_answer_txt_exists),
("Files in Correct Locations", verify_file_locations),
("CSV Structure", verify_csv_structure),
("CSV Content Accuracy (Flexible)", verify_csv_content_accuracy),
("CSV Data Completeness", verify_csv_data_completeness),
("Answer Content", verify_answer_content),
]
# Run all verification steps
all_passed = True
for step_name, verify_func in verification_steps:
print(f"\n--- {step_name} ---")
if not verify_func(test_dir):
all_passed = False
# Final result
print("\n" + "="*50)
if all_passed:
print("✅ Contact Information Compilation Task completed successfully!")
print("🎉 Task verification: PASS")
sys.exit(0)
else:
print("❌ Task verification: FAIL")
sys.exit(1)
if __name__ == "__main__":
main()
================================================
FILE: tasks/filesystem/standard/desktop_template/file_arrangement/description.md
================================================
Please use FileSystem tools to finish the following task:
### Task Description
You are tasked with organizing files on an AI researcher's desktop into a structured folder system. You need to create specific folders and move files to their designated locations according to the provided organization scheme.
### Task Objectives
1. **Create the following folder structure** in the main directory:
- `work/` - for work, research, and project-related files
- `life/` - for files related to personal life
- `archives/` - for archived files or files with past dates in their file names
- `temp/` - for temporary files and drafts
- `others/` - for files that cannot be classified elsewhere
### Important Notes
- All files must be moved from their current locations to the specified folders
- The `others/` folder is for files that don't fit the other categories
- Do not modify the contents of any files, only move them to the correct locations
- If you are not sure which folder a file belongs to, read its contents before deciding
- **Do not change any file names**
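For illustration, a minimal sketch of the folder-creation and move mechanics using Python's `pathlib` and `shutil`; the `plan` mapping and its example entry are hypothetical, since deciding where each file belongs is the actual task:

```python
import shutil
from pathlib import Path

def arrange(root: Path, plan: dict[str, str]) -> None:
    """Create the five category folders and move files per `plan`.

    `plan` maps a file's current relative path to a category folder name,
    e.g. {"Documents/Work/client_list.csv": "work"}; building that mapping
    (reading file contents where the name is ambiguous) is the task itself.
    """
    for folder in ("work", "life", "archives", "temp", "others"):
        (root / folder).mkdir(exist_ok=True)
    for rel_path, folder in plan.items():
        src = root / rel_path
        # Keep the original file name; only the location changes.
        shutil.move(str(src), str(root / folder / src.name))
```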
================================================
FILE: tasks/filesystem/standard/desktop_template/file_arrangement/meta.json
================================================
{
"task_id": "file_arrangement",
"task_name": "File Arrangement",
"category_id": "desktop_template",
"category_name": "Desktop Template",
"description": "Classify and organize desktop files into appropriate categories following specified classification rules and naming convention standards.",
"author": "Lingjun Chen",
"created_at": "2025-08-14",
"difficulty": "L3",
"tags": [
"file organization"
],
"mcp": [
"filesystem"
],
"meta_data": {
"stateType": "text",
"stateContent": "desktop_template/\n ├── Archives/\n │ ├── backup_contacts.csv\n │ └── tax_documents_2022.csv\n ├── Desktop/\n │ └── contacts.csv\n ├── Documents/\n │ ├── Personal/\n │ │ └── tax_info_2023.csv\n │ ├── Projects/\n │ │ └── budget_tracker.csv\n │ ├── Work/\n │ │ ├── client_list.csv\n │ │ └── timesheet.csv\n │ ├── budget.csv\n │ └── important_dates.csv\n ├── Downloads/\n │ ├── expenses.csv\n │ ├── fitness_log.csv\n │ └── price_comparisons.csv\n ├── Temp/\n │ └── test_data.csv\n ├── book_list.txt\n ├── bookmark_export.txt\n ├── calculations.txt\n ├── correspondence_2023.txt\n ├── draft_letter.txt\n ├── emergency_contacts.txt\n ├── example.txt\n └── experiment_results.txt\n",
"stateUrl": "https://storage.mcpmark.ai/filesystem/desktop_template.zip",
"stateOriginalUrl": null
}
}
================================================
FILE: tasks/filesystem/standard/desktop_template/file_arrangement/verify.py
================================================
#!/usr/bin/env python3
"""
Verification script for Desktop File Organization Task
"""
import sys
from pathlib import Path
import os
def get_test_directory() -> Path:
"""Get the test directory from FILESYSTEM_TEST_DIR env var."""
test_root = os.environ.get("FILESYSTEM_TEST_DIR")
if not test_root:
raise ValueError("FILESYSTEM_TEST_DIR environment variable is required")
return Path(test_root)
def verify_folder_structure(test_dir: Path) -> bool:
"""Verify that all required folders exist."""
required_folders = ["work", "life", "archives", "temp", "others"]
missing_folders = []
for folder in required_folders:
folder_path = test_dir / folder
if not folder_path.exists() or not folder_path.is_dir():
missing_folders.append(folder)
if missing_folders:
print(f"❌ Missing required folders: {missing_folders}")
return False
print("✅ All required folders exist")
return True
def verify_work_folder_files(test_dir: Path) -> bool:
"""Verify that work folder contains the required files."""
work_dir = test_dir / "work"
required_files = [
"client_list.csv",
"timesheet.csv",
"experiment_results.txt",
"budget_tracker.csv",
"expenses.csv"
]
missing_files = []
for file_name in required_files:
file_path = work_dir / file_name
if not file_path.exists():
missing_files.append(file_name)
if missing_files:
print(f"❌ Missing required files in work/ folder: {missing_files}")
return False
# Count total files in work folder for info
total_files = len([f for f in work_dir.iterdir() if f.is_file()])
print(f"✅ All required files found in work/ folder (total: {total_files} files)")
return True
def verify_life_folder_files(test_dir: Path) -> bool:
"""Verify that life folder contains the required files."""
life_dir = test_dir / "life"
required_files = [
"contacts.csv",
"budget.csv",
"fitness_log.csv",
"price_comparisons.csv",
"book_list.txt",
"bookmark_export.txt",
"emergency_contacts.txt"
]
missing_files = []
for file_name in required_files:
file_path = life_dir / file_name
if not file_path.exists():
missing_files.append(file_name)
if missing_files:
print(f"❌ Missing required files in life/ folder: {missing_files}")
return False
# Count total files in life folder for info
total_files = len([f for f in life_dir.iterdir() if f.is_file()])
print(f"✅ All required files found in life/ folder (total: {total_files} files)")
return True
def verify_archives_folder_files(test_dir: Path) -> bool:
"""Verify that archives folder contains the required files."""
archives_dir = test_dir / "archives"
required_files = [
"backup_contacts.csv",
"tax_documents_2022.csv",
"correspondence_2023.txt",
"tax_info_2023.csv"
]
missing_files = []
for file_name in required_files:
file_path = archives_dir / file_name
if not file_path.exists():
missing_files.append(file_name)
if missing_files:
print(f"❌ Missing required files in archives/ folder: {missing_files}")
return False
# Count total files in archives folder for info
total_files = len([f for f in archives_dir.iterdir() if f.is_file()])
print(f"✅ All required files found in archives/ folder (total: {total_files} files)")
return True
def verify_temp_folder_files(test_dir: Path) -> bool:
"""Verify that temp folder contains the required files."""
temp_dir = test_dir / "temp"
required_files = [
"test_data.csv",
"draft_letter.txt"
]
missing_files = []
for file_name in required_files:
file_path = temp_dir / file_name
if not file_path.exists():
missing_files.append(file_name)
if missing_files:
print(f"❌ Missing required files in temp/ folder: {missing_files}")
return False
# Count total files in temp folder for info
total_files = len([f for f in temp_dir.iterdir() if f.is_file()])
print(f"✅ All required files found in temp/ folder (total: {total_files} files)")
return True
def verify_others_folder_files(test_dir: Path) -> bool:
"""Verify that others folder exists and can contain any files."""
others_dir = test_dir / "others"
if not others_dir.exists() or not others_dir.is_dir():
print("❌ others/ folder not found")
return False
# Count files in others folder for info
total_files = len([f for f in others_dir.iterdir() if f.is_file()])
print(f"✅ others/ folder exists (contains {total_files} files)")
return True
def verify_required_files_in_correct_folders(test_dir: Path) -> bool:
"""Verify that all 18 required files are in their correct designated folders."""
# Define the mapping of required files to their correct folders
required_file_mapping = {
"work": [
"client_list.csv",
"timesheet.csv",
"experiment_results.txt",
"budget_tracker.csv",
"expenses.csv",
],
"life": [
"contacts.csv",
"budget.csv",
"fitness_log.csv",
"price_comparisons.csv",
"book_list.txt",
"bookmark_export.txt",
"emergency_contacts.txt"
],
"archives": [
"backup_contacts.csv",
"tax_documents_2022.csv",
"correspondence_2023.txt",
"tax_info_2023.csv"
],
"temp": [
"test_data.csv",
"draft_letter.txt"
]
}
missing_files = []
# Check each required file is in its correct folder
for folder, files in required_file_mapping.items():
folder_path = test_dir / folder
for file_name in files:
file_path = folder_path / file_name
if not file_path.exists():
missing_files.append(f"{folder}/{file_name}")
if missing_files:
print(f"❌ Missing required files: {missing_files}")
return False
print("✅ All 18 required files are in their correct designated folders")
return True
def verify_no_duplicate_required_files(test_dir: Path) -> bool:
"""Verify that the 18 required files are not duplicated across folders."""
required_files = [
"client_list.csv", "timesheet.csv", "experiment_results.txt", "budget_tracker.csv",
"contacts.csv", "budget.csv", "expenses.csv", "fitness_log.csv",
"price_comparisons.csv", "book_list.txt", "bookmark_export.txt", "emergency_contacts.txt",
"backup_contacts.csv", "tax_documents_2022.csv", "correspondence_2023.txt", "tax_info_2023.csv",
"test_data.csv", "draft_letter.txt"
]
# Check for duplicates of required files
file_locations = {}
duplicates = []
for folder in ["work", "life", "archives", "temp", "others"]:
folder_path = test_dir / folder
if folder_path.exists() and folder_path.is_dir():
for file_path in folder_path.iterdir():
if file_path.is_file() and file_path.name in required_files:
if file_path.name in file_locations:
duplicates.append(f"{file_path.name} (in {file_locations[file_path.name]} and {folder}/)")
else:
file_locations[file_path.name] = f"{folder}/"
if duplicates:
print(f"❌ Duplicate required files found: {duplicates}")
return False
print("✅ No duplicate required files found")
return True
def main():
"""Main verification function."""
test_dir = get_test_directory()
print("🔍 Verifying Desktop File Organization Task...")
# Define verification steps
verification_steps = [
("Folder Structure", verify_folder_structure),
("Required Files in Work Folder", verify_work_folder_files),
("Required Files in Life Folder", verify_life_folder_files),
("Required Files in Archives Folder", verify_archives_folder_files),
("Required Files in Temp Folder", verify_temp_folder_files),
("Others Folder Exists", verify_others_folder_files),
("All Required Files in Correct Folders", verify_required_files_in_correct_folders),
("No Duplicate Required Files", verify_no_duplicate_required_files),
]
# Run all verification steps
all_passed = True
for step_name, verify_func in verification_steps:
print(f"\n--- {step_name} ---")
if not verify_func(test_dir):
all_passed = False
# Final result
print("\n" + "="*50)
if all_passed:
print("✅ Desktop file organization task completed successfully!")
print("🎉 All 18 required files are correctly placed in their designated folders")
print("📊 Summary:")
print(" - work/ folder: 5 required files")
print(" - life/ folder: 7 required files")
print(" - archives/ folder: 4 required files")
print(" - temp/ folder: 2 required files")
print(" - others/ folder: can contain any files")
print(" - Total required files: 18")
print(" - Note: Other files can be placed in any folder")
sys.exit(0)
else:
print("❌ Desktop file organization task verification: FAIL")
print("Please check the errors above and ensure all 18 required files are in their correct locations")
sys.exit(1)
if __name__ == "__main__":
main()
================================================
FILE: tasks/filesystem/standard/file_context/duplicates_searching/description.md
================================================
Please use FileSystem tools to finish the following task:
### Task Description
You are given a directory containing multiple text files. Some files have identical content and need to be organized. Your task is to identify all files with duplicate content and move them to a newly created 'duplicates' directory.
### Task Objectives
1. **Scan all text files** in the test directory to identify groups with identical content
2. **Create a 'duplicates' directory** in the test directory root
3. **Move all duplicate files** into the 'duplicates' directory
4. **Leave unique files** in their original location
### Expected Output
After completing the task, the directory structure should be:
- `duplicates/` directory containing all files with duplicate content
- Original directory containing only files with unique content
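For illustration, a minimal sketch of one way to find the duplicate groups: hash each file's content (the verification script below uses the same MD5 approach for its integrity check) and move every member of a group that contains more than one file:

```python
import hashlib
import shutil
from collections import defaultdict
from pathlib import Path

def move_duplicates(test_dir: Path) -> None:
    """Group .txt files by content hash and move duplicate groups."""
    groups: dict[str, list[Path]] = defaultdict(list)
    for path in sorted(test_dir.glob("*.txt")):
        digest = hashlib.md5(path.read_bytes()).hexdigest()
        groups[digest].append(path)

    dup_dir = test_dir / "duplicates"
    dup_dir.mkdir(exist_ok=True)
    for paths in groups.values():
        if len(paths) > 1:  # every member of a duplicate group moves
            for path in paths:
                shutil.move(str(path), str(dup_dir / path.name))
```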
================================================
FILE: tasks/filesystem/standard/file_context/duplicates_searching/meta.json
================================================
{
"task_id": "duplicates_searching",
"task_name": "Duplicates Searching",
"category_id": "file_context",
"category_name": "File Context",
"description": "Scan directory to identify files with identical content, then organize all duplicate files into a separate dedicated directory for cleanup.",
"author": "Lingjun Chen",
"created_at": "2025-08-06",
"difficulty": "L3",
"tags": [
"pattern analysis",
"file organization"
],
"mcp": [
"filesystem"
],
"meta_data": {
"stateType": "text",
"stateContent": "file_context/\n ├── file_01.txt\n ├── file_02.txt\n ├── file_03.txt\n ├── file_04.txt\n ├── file_05.txt\n ├── file_06.txt\n ├── file_07.txt\n ├── file_08.txt\n ├── file_09.txt\n ├── file_10.txt\n ├── file_11.txt\n ├── file_12.txt\n ├── file_13.txt\n ├── file_14.txt\n ├── file_15.txt\n ├── file_16.txt\n ├── file_17.txt\n ├── file_18.txt\n ├── file_19.txt\n ├── file_20.txt\n └── large_file.txt",
"stateUrl": "https://storage.mcpmark.ai/filesystem/file_context.zip",
"stateOriginalUrl": null
}
}
================================================
FILE: tasks/filesystem/standard/file_context/duplicates_searching/verify.py
================================================
#!/usr/bin/env python3
"""
Verification script for File Duplicates Detection and Organization Task
"""
import sys
from pathlib import Path
import os
import hashlib
def get_test_directory() -> Path:
"""Get the test directory from FILESYSTEM_TEST_DIR env var."""
test_root = os.environ.get("FILESYSTEM_TEST_DIR")
if not test_root:
raise ValueError("FILESYSTEM_TEST_DIR environment variable is required")
return Path(test_root)
def calculate_file_hash(file_path: Path) -> str:
"""Calculate MD5 hash of file content."""
try:
with open(file_path, 'rb') as f:
return hashlib.md5(f.read()).hexdigest()
except Exception as e:
print(f"❌ Error reading file {file_path}: {e}")
return None
def verify_duplicates_directory_exists(test_dir: Path) -> bool:
"""Verify that the duplicates directory exists."""
duplicates_dir = test_dir / "duplicates"
if not duplicates_dir.exists():
print("❌ 'duplicates' directory not found")
return False
if not duplicates_dir.is_dir():
print("❌ 'duplicates' exists but is not a directory")
return False
print("✅ 'duplicates' directory exists")
return True
def get_expected_duplicate_groups():
"""Return the expected duplicate file groups based on content analysis."""
# Based on the answer.md and content analysis
return {
# Group 1: file_01.txt, file_02.txt (identical content)
"group1": ["file_01.txt", "file_02.txt"],
# Group 2: file_03.txt, file_04.txt (identical content)
"group2": ["file_03.txt", "file_04.txt"],
# Group 3: file_07.txt, file_08.txt (identical content)
"group3": ["file_07.txt", "file_08.txt"],
# Group 4: file_10.txt, file_11.txt (identical content)
"group4": ["file_10.txt", "file_11.txt"],
# Group 5: file_13.txt, file_14.txt (identical content)
"group5": ["file_13.txt", "file_14.txt"],
# Group 6: file_15.txt, file_16.txt (identical content)
"group6": ["file_15.txt", "file_16.txt"],
# Group 7: file_18.txt, file_19.txt (identical content)
"group7": ["file_18.txt", "file_19.txt"]
}
def get_expected_unique_files():
"""Return the expected unique files that should remain in original location."""
return [
"file_05.txt", "file_06.txt", "file_09.txt",
"file_12.txt", "file_17.txt", "file_20.txt"
]
def verify_duplicate_files_moved(test_dir: Path) -> bool:
"""Verify that all duplicate files have been moved to the duplicates directory."""
duplicates_dir = test_dir / "duplicates"
expected_duplicate_groups = get_expected_duplicate_groups()
# Check that all expected duplicate files are in the duplicates directory
missing_files = []
for group_name, files in expected_duplicate_groups.items():
for filename in files:
file_path = duplicates_dir / filename
if not file_path.exists():
missing_files.append(filename)
if missing_files:
print(f"❌ Missing duplicate files in 'duplicates' directory: {missing_files}")
return False
print("✅ All expected duplicate files are in the 'duplicates' directory")
return True
def verify_unique_files_remain(test_dir: Path) -> bool:
"""Verify that unique files remain in the original location."""
expected_unique_files = get_expected_unique_files()
missing_files = []
for filename in expected_unique_files:
file_path = test_dir / filename
if not file_path.exists():
missing_files.append(filename)
if missing_files:
print(f"❌ Missing unique files in original location: {missing_files}")
return False
print("✅ All expected unique files remain in the original location")
return True
def verify_no_duplicate_files_in_original(test_dir: Path) -> bool:
"""Verify that no duplicate files remain in the original location."""
expected_duplicate_groups = get_expected_duplicate_groups()
remaining_duplicates = []
for group_name, files in expected_duplicate_groups.items():
for filename in files:
file_path = test_dir / filename
if file_path.exists():
remaining_duplicates.append(filename)
if remaining_duplicates:
print(f"❌ Duplicate files still exist in original location: {remaining_duplicates}")
return False
print("✅ No duplicate files remain in the original location")
return True
def verify_content_integrity(test_dir: Path) -> bool:
"""Verify that file content integrity is maintained after moving."""
duplicates_dir = test_dir / "duplicates"
expected_duplicate_groups = get_expected_duplicate_groups()
# Check that files in each duplicate group have identical content
for group_name, files in expected_duplicate_groups.items():
if len(files) < 2:
continue
# Calculate hash of the first file in the group
first_file = duplicates_dir / files[0]
if not first_file.exists():
print(f"❌ First file of group {group_name} not found: {files[0]}")
return False
first_hash = calculate_file_hash(first_file)
if first_hash is None:
return False
# Check that all other files in the group have the same hash
for filename in files[1:]:
file_path = duplicates_dir / filename
if not file_path.exists():
print(f"❌ File in group {group_name} not found: {filename}")
return False
file_hash = calculate_file_hash(file_path)
if file_hash is None:
return False
if file_hash != first_hash:
print(f"❌ Files in group {group_name} have different content: {files[0]} vs {filename}")
return False
print("✅ Content integrity verified - duplicate files have identical content")
return True
def verify_total_file_count(test_dir: Path) -> bool:
"""Verify that the duplicates directory contains exactly 14 files."""
duplicates_dir = test_dir / "duplicates"
# Count files in original location (excluding the duplicates directory itself)
original_files = [f for f in test_dir.iterdir() if f.is_file()]
# Count files in duplicates directory
duplicate_files = [f for f in duplicates_dir.iterdir() if f.is_file()]
# Expected: 14 files in duplicates directory
expected_duplicates = 14
actual_duplicates = len(duplicate_files)
if actual_duplicates != expected_duplicates:
print(f"❌ Wrong number of files in duplicates directory. Expected: {expected_duplicates}, Actual: {actual_duplicates}")
return False
print(f"✅ Duplicates directory has correct number of files: {actual_duplicates}")
return True
def main():
"""Main verification function."""
test_dir = get_test_directory()
print("🔍 Verifying File Duplicates Detection and Organization Task...")
# Define verification steps
verification_steps = [
("Duplicates Directory Exists", verify_duplicates_directory_exists),
("Duplicate Files Moved", verify_duplicate_files_moved),
("Unique Files Remain", verify_unique_files_remain),
("No Duplicates in Original", verify_no_duplicate_files_in_original),
("Content Integrity", verify_content_integrity),
("Duplicates Count", verify_total_file_count),
]
# Run all verification steps
all_passed = True
for step_name, verify_func in verification_steps:
print(f"\n--- {step_name} ---")
if not verify_func(test_dir):
all_passed = False
# Final result
print("\n" + "="*50)
if all_passed:
print("✅ File duplicates detection and organization completed correctly!")
print("🎉 Task verification: PASS")
sys.exit(0)
else:
print("❌ Task verification: FAIL")
sys.exit(1)
if __name__ == "__main__":
main()
================================================
FILE: tasks/filesystem/standard/file_context/file_merging/description.md
================================================
Please use FileSystem tools to finish the following task:
### Task Description
You are given a directory containing multiple text files of varying sizes. Your task is to identify the 10 smallest .txt files, merge their content in alphabetical order, and create a consolidated file called "merged_content.txt" with proper formatting.
### Task Objectives
1. **Identify the 10 smallest .txt files** in the test directory
2. **Sort the selected files alphabetically** by filename
3. **Merge the content** of these files into a single file
4. **Add file headers** (file name) before each file's content
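For illustration, a minimal sketch of the selection and merge steps; the `=== name ===` header style is an assumption, since the task only requires that each section be preceded by the file's name:

```python
from pathlib import Path

def merge_smallest(test_dir: Path, count: int = 10) -> None:
    """Merge the `count` smallest .txt files, alphabetically, with headers."""
    candidates = [p for p in test_dir.glob("*.txt") if p.name != "merged_content.txt"]
    # Select by file size, then order the selection alphabetically by name.
    smallest = sorted(candidates, key=lambda p: p.stat().st_size)[:count]
    parts = []
    for path in sorted(smallest, key=lambda p: p.name):
        parts.append(f"=== {path.name} ===\n{path.read_text()}")
    (test_dir / "merged_content.txt").write_text("\n".join(parts))
```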
================================================
FILE: tasks/filesystem/standard/file_context/file_merging/meta.json
================================================
{
"task_id": "file_merging",
"task_name": "File Merging",
"category_id": "file_context",
"category_name": "File Context",
"description": "Identify the 10 smallest text files in the directory, then merge their content in alphabetical order into a single consolidated file.",
"author": "Lingjun Chen",
"created_at": "2025-08-07",
"difficulty": "L3",
"tags": [
"content transformation",
"file organization"
],
"mcp": [
"filesystem"
],
"meta_data": {
"stateType": "text",
"stateContent": "file_context/\n ├── file_01.txt\n ├── file_02.txt\n ├── file_03.txt\n ├── file_04.txt\n ├── file_05.txt\n ├── file_06.txt\n ├── file_07.txt\n ├── file_08.txt\n ├── file_09.txt\n ├── file_10.txt\n ├── file_11.txt\n ├── file_12.txt\n ├── file_13.txt\n ├── file_14.txt\n ├── file_15.txt\n ├── file_16.txt\n ├── file_17.txt\n ├── file_18.txt\n ├── file_19.txt\n ├── file_20.txt\n └── large_file.txt",
"stateUrl": "https://storage.mcpmark.ai/filesystem/file_context.zip",
"stateOriginalUrl": null
}
}
================================================
FILE: tasks/filesystem/standard/file_context/file_merging/verify.py
================================================
#!/usr/bin/env python3
"""
Verification script for File Merging Task
"""
import sys
from pathlib import Path
import os
def get_test_directory() -> Path:
"""Get the test directory from FILESYSTEM_TEST_DIR env var."""
test_root = os.environ.get("FILESYSTEM_TEST_DIR")
if not test_root:
raise ValueError("FILESYSTEM_TEST_DIR environment variable is required")
return Path(test_root)
def get_expected_files() -> list:
"""Get the expected 10 files in alphabetical order."""
# The 10 smallest files (excluding file_12.txt) in alphabetical order
expected_files = [
"file_10.txt",
"file_11.txt",
"file_13.txt",
"file_14.txt",
"file_15.txt",
"file_16.txt",
"file_17.txt",
"file_18.txt",
"file_19.txt",
"file_20.txt"
]
return expected_files
def verify_merged_file_exists(test_dir: Path) -> bool:
"""Verify that the merged_content.txt file exists."""
merged_file = test_dir / "merged_content.txt"
if not merged_file.exists():
print("❌ File 'merged_content.txt' not found")
return False
print("✅ Merged content file found")
return True
def verify_correct_files_selected(test_dir: Path) -> bool:
"""Verify that the correct 10 files were selected and included."""
expected_files = get_expected_files()
merged_file = test_dir / "merged_content.txt"
try:
content = merged_file.read_text()
# Check if all expected files are present
for expected_file in expected_files:
if expected_file not in content:
print(f"❌ Expected file '{expected_file}' not found in merged content")
return False
# Check if file_12.txt is NOT present (should be excluded)
if "file_12.txt" in content:
print("❌ file_12.txt should be excluded but was found in merged content")
return False
print("✅ Correct files selected and included")
return True
except Exception as e:
print(f"❌ Error verifying file selection: {e}")
return False
def verify_alphabetical_order(test_dir: Path) -> bool:
"""Verify that files are in alphabetical order."""
expected_files = get_expected_files()
merged_file = test_dir / "merged_content.txt"
try:
content = merged_file.read_text()
lines = content.split('\n')
# Extract filenames from the content (lines that contain .txt)
found_files = []
for line in lines:
line = line.strip()
# Check if this line contains any of the expected filenames
for expected_file in expected_files:
if expected_file in line:
found_files.append(expected_file)
break
# Check if files are in alphabetical order
if found_files != expected_files:
print(f"❌ Files not in correct alphabetical order")
print(f" Expected: {expected_files}")
print(f" Found: {found_files}")
return False
print("✅ Files are in correct alphabetical order")
return True
except Exception as e:
print(f"❌ Error verifying alphabetical order: {e}")
return False
def verify_file_content_integrity(test_dir: Path) -> bool:
"""Verify that the content of each file is preserved correctly."""
expected_files = get_expected_files()
merged_file = test_dir / "merged_content.txt"
try:
content = merged_file.read_text()
lines = content.split('\n')
for expected_file in expected_files:
# Get the original file content
original_file = test_dir / expected_file
original_content = original_file.read_text().strip()
# Find the line index where this file's header appears
header_line_index = -1
for i, line in enumerate(lines):
if expected_file in line:
header_line_index = i
break
if header_line_index == -1:
print(f"❌ Could not find header for {expected_file}")
return False
# Find the next header line or end of file
next_header_index = len(lines)
for i in range(header_line_index + 1, len(lines)):
for other_file in expected_files:
if other_file != expected_file and other_file in lines[i]:
next_header_index = i
break
if next_header_index != len(lines):
break
# Extract content lines (from header + 1 to next header)
content_lines = lines[header_line_index + 1:next_header_index]
merged_content = '\n'.join(content_lines).strip()
if merged_content != original_content:
print(f"❌ Content mismatch for {expected_file}")
print(f" Expected: {original_content}")
print(f" Found: {merged_content}")
return False
print("✅ All file contents preserved correctly")
return True
except Exception as e:
print(f"❌ Error verifying content integrity: {e}")
return False
def verify_filename_headers(test_dir: Path) -> bool:
"""Verify that each file section starts with the correct filename header."""
expected_files = get_expected_files()
merged_file = test_dir / "merged_content.txt"
try:
content = merged_file.read_text()
for expected_file in expected_files:
# Check if the filename appears anywhere in the content (as part of a line)
if expected_file not in content:
print(f"❌ Filename header '{expected_file}' not found")
return False
print("✅ All filename headers present and correctly formatted")
return True
except Exception as e:
print(f"❌ Error verifying filename headers: {e}")
return False
def main():
"""Main verification function."""
test_dir = get_test_directory()
print("🔍 Verifying File Merging Task...")
# Show expected files for debugging
expected_files = get_expected_files()
print(f"📋 Expected files (10 smallest, excluding file_12.txt): {expected_files}")
# Define verification steps
verification_steps = [
("Merged File Exists", verify_merged_file_exists),
("Correct Files Selected", verify_correct_files_selected),
("Alphabetical Order", verify_alphabetical_order),
("Filename Headers", verify_filename_headers),
("Content Integrity", verify_file_content_integrity),
]
# Run all verification steps
all_passed = True
for step_name, verify_func in verification_steps:
print(f"\n--- {step_name} ---")
if not verify_func(test_dir):
all_passed = False
# Final result
print("\n" + "="*50)
if all_passed:
print("✅ File merging task completed correctly!")
print("🎉 Task verification: PASS")
sys.exit(0)
else:
print("❌ Task verification: FAIL")
sys.exit(1)
if __name__ == "__main__":
main()
================================================
FILE: tasks/filesystem/standard/file_context/file_splitting/description.md
================================================
Please use FileSystem tools to finish the following task:
### Task Description
You need to split a large text file into multiple smaller files with equal character counts. The task involves creating a new directory and splitting the content into exactly 10 files.
### Task Objectives
1. **Create a new directory** named `split` in the test directory
2. **Split the file** `large_file.txt` into exactly 10 files with equal character counts
3. **Name the files** as `split_01.txt`, `split_02.txt`, ..., `split_10.txt` in the `split` directory
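For illustration, a minimal sketch of the splitting arithmetic, assuming the character count of `large_file.txt` divides evenly by 10 (otherwise the trailing remainder would need separate handling):

```python
from pathlib import Path

def split_file(test_dir: Path, parts: int = 10) -> None:
    """Split large_file.txt into `parts` files with equal character counts."""
    text = (test_dir / "large_file.txt").read_text()
    size = len(text) // parts  # assumes len(text) divides evenly by `parts`
    out_dir = test_dir / "split"
    out_dir.mkdir(exist_ok=True)
    for i in range(parts):
        chunk = text[i * size : (i + 1) * size]
        (out_dir / f"split_{i + 1:02d}.txt").write_text(chunk)
```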
================================================
FILE: tasks/filesystem/standard/file_context/file_splitting/meta.json
================================================
{
"task_id": "file_splitting",
"task_name": "File Splitting",
"category_id": "file_context",
"category_name": "File Context",
"description": "Split a large text file into multiple equal-length segments for easier processing, distribution, and parallel handling of content.",
"author": "Lingjun Chen",
"created_at": "2025-08-08",
"difficulty": "L3",
"tags": [
"content transformation"
],
"mcp": [
"filesystem"
],
"meta_data": {
"stateType": "text",
"stateContent": "file_context/\n ├── file_01.txt\n ├── file_02.txt\n ├── file_03.txt\n ├── file_04.txt\n ├── file_05.txt\n ├── file_06.txt\n ├── file_07.txt\n ├── file_08.txt\n ├── file_09.txt\n ├── file_10.txt\n ├── file_11.txt\n ├── file_12.txt\n ├── file_13.txt\n ├── file_14.txt\n ├── file_15.txt\n ├── file_16.txt\n ├── file_17.txt\n ├── file_18.txt\n ├── file_19.txt\n ├── file_20.txt\n └── large_file.txt",
"stateUrl": "https://storage.mcpmark.ai/filesystem/file_context.zip",
"stateOriginalUrl": null
}
}
================================================
FILE: tasks/filesystem/standard/file_context/file_splitting/verify.py
================================================
#!/usr/bin/env python3
"""
Verification script for File Splitting Task
"""
import sys
from pathlib import Path
import os
def get_test_directory() -> Path:
"""Get the test directory from FILESYSTEM_TEST_DIR env var."""
test_root = os.environ.get("FILESYSTEM_TEST_DIR")
if not test_root:
raise ValueError("FILESYSTEM_TEST_DIR environment variable is required")
return Path(test_root)
def verify_split_directory_exists(test_dir: Path) -> bool:
"""Verify that the split directory exists."""
split_dir = test_dir / "split"
if not split_dir.exists():
print("❌ Directory 'split' not found")
return False
if not split_dir.is_dir():
print("❌ 'split' exists but is not a directory")
return False
print("✅ Split directory found")
return True
def verify_all_split_files_exist(test_dir: Path) -> bool:
"""Verify that all 10 split files exist with correct names."""
split_dir = test_dir / "split"
expected_files = [f"split_{i:02d}.txt" for i in range(1, 11)]
missing_files = []
for filename in expected_files:
file_path = split_dir / filename
if not file_path.exists():
missing_files.append(filename)
if missing_files:
print(f"❌ Missing files: {missing_files}")
return False
print("✅ All 10 split files exist with correct names")
return True
def verify_equal_file_lengths(test_dir: Path) -> bool:
"""Verify that all split files have equal character counts."""
split_dir = test_dir / "split"
file_lengths = []
for i in range(1, 11):
filename = f"split_{i:02d}.txt"
file_path = split_dir / filename
try:
content = file_path.read_text()
file_lengths.append(len(content))
except Exception as e:
print(f"❌ Error reading {filename}: {e}")
return False
# Check if all lengths are equal
if len(set(file_lengths)) != 1:
print(f"❌ File lengths are not equal: {file_lengths}")
return False
print(f"✅ All files have equal length: {file_lengths[0]} characters")
return True
def verify_content_integrity(test_dir: Path) -> bool:
"""Verify that concatenated split files equal the original file."""
split_dir = test_dir / "split"
original_file = test_dir / "large_file.txt"
# Read original content
try:
original_content = original_file.read_text()
except Exception as e:
print(f"❌ Error reading original file: {e}")
return False
# Concatenate all split files
concatenated_content = ""
for i in range(1, 11):
filename = f"split_{i:02d}.txt"
file_path = split_dir / filename
try:
content = file_path.read_text()
concatenated_content += content
except Exception as e:
print(f"❌ Error reading {filename}: {e}")
return False
# Compare content
if concatenated_content != original_content:
print("❌ Concatenated content does not match original file")
print(f" Original length: {len(original_content)}")
print(f" Concatenated length: {len(concatenated_content)}")
return False
print("✅ Concatenated content matches original file exactly")
return True
def verify_no_extra_files(test_dir: Path) -> bool:
"""Verify that no extra files exist in the split directory."""
split_dir = test_dir / "split"
expected_files = {f"split_{i:02d}.txt" for i in range(1, 11)}
actual_files = {f.name for f in split_dir.iterdir() if f.is_file()}
extra_files = actual_files - expected_files
if extra_files:
print(f"❌ Extra files found in split directory: {extra_files}")
return False
print("✅ No extra files in split directory")
return True
def main():
"""Main verification function."""
test_dir = get_test_directory()
print("🔍 Verifying File Splitting Task...")
# Define verification steps
verification_steps = [
("Split Directory Exists", verify_split_directory_exists),
("All Split Files Exist", verify_all_split_files_exist),
("Equal File Lengths", verify_equal_file_lengths),
("Content Integrity", verify_content_integrity),
("No Extra Files", verify_no_extra_files),
]
# Run all verification steps
all_passed = True
for step_name, verify_func in verification_steps:
print(f"\n--- {step_name} ---")
if not verify_func(test_dir):
all_passed = False
# Final result
print("\n" + "="*50)
if all_passed:
print("✅ File splitting task completed correctly!")
print("🎉 Task verification: PASS")
sys.exit(0)
else:
print("❌ Task verification: FAIL")
sys.exit(1)
if __name__ == "__main__":
main()
================================================
FILE: tasks/filesystem/standard/file_context/pattern_matching/description.md
================================================
Please use FileSystem tools to finish the following task:
### Task Description
Your task is to find all files that contain a substring of 30 or more characters that also appears in `large_file.txt`. **You are not allowed to use Python code.**
### Task Objectives
1. **Read the reference file** `large_file.txt` to understand its content
2. **Examine each file** from file_01.txt to file_20.txt
3. **Find files** that contain a substring of 30 or more characters that matches a substring in `large_file.txt`
4. **Create a file `answer.txt`** and write the results to it with the following format:
- One line per matching file
- Format: `filename.txt,start_position`
- Where start_position is the character position (1-indexed) of the matching substring in `large_file.txt`
- Do not add anything other than `filename.txt,start_position`.
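The task itself must be solved without Python, but for reference, the check being performed is a brute-force shared-substring search, essentially what the verification script below recomputes; a minimal sketch:

```python
from typing import Optional

def longest_shared_start(file_text: str, large_text: str, min_len: int = 30) -> Optional[int]:
    """Return the 1-indexed position in `large_text` of the longest substring
    of `file_text` (at least `min_len` characters long) that also occurs in
    `large_text`, or None if no such substring exists."""
    best = ""
    best_start = None
    for start in range(len(file_text)):
        for end in range(start + min_len, len(file_text) + 1):
            candidate = file_text[start:end]
            if candidate in large_text and len(candidate) > len(best):
                best = candidate
                best_start = large_text.find(candidate) + 1  # 1-indexed
    return best_start
```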
================================================
FILE: tasks/filesystem/standard/file_context/pattern_matching/meta.json
================================================
{
"task_id": "pattern_matching",
"task_name": "Pattern Matching",
"category_id": "file_context",
"category_name": "File Context",
"description": "Search multiple files for shared character sequences and precisely locate all matching pattern occurrences within the target files.",
"author": "Lingjun Chen",
"created_at": "2025-08-06",
"difficulty": "L3",
"tags": [
"pattern analysis",
"cross-referencing"
],
"mcp": [
"filesystem"
],
"meta_data": {
"stateType": "text",
"stateContent": "file_context/\n ├── file_01.txt\n ├── file_02.txt\n ├── file_03.txt\n ├── file_04.txt\n ├── file_05.txt\n ├── file_06.txt\n ├── file_07.txt\n ├── file_08.txt\n ├── file_09.txt\n ├── file_10.txt\n ├── file_11.txt\n ├── file_12.txt\n ├── file_13.txt\n ├── file_14.txt\n ├── file_15.txt\n ├── file_16.txt\n ├── file_17.txt\n ├── file_18.txt\n ├── file_19.txt\n ├── file_20.txt\n └── large_file.txt",
"stateUrl": "https://storage.mcpmark.ai/filesystem/file_context.zip",
"stateOriginalUrl": null
}
}
================================================
FILE: tasks/filesystem/standard/file_context/pattern_matching/verify.py
================================================
#!/usr/bin/env python3
"""
Verification script for File Filtering Task: Find Files with Common Substring
"""
import sys
from pathlib import Path
import os
import re
def get_test_directory() -> Path:
"""Get the test directory from FILESYSTEM_TEST_DIR env var."""
test_root = os.environ.get("FILESYSTEM_TEST_DIR")
if not test_root:
raise ValueError("FILESYSTEM_TEST_DIR environment variable is required")
return Path(test_root)
def verify_answer_file_exists(test_dir: Path) -> bool:
"""Verify that the answer.txt file exists."""
answer_file = test_dir / "answer.txt"
if not answer_file.exists():
print("❌ File 'answer.txt' not found")
return False
print("✅ Answer file found")
return True
def verify_answer_format(test_dir: Path) -> bool:
"""Verify that the answer file has the correct format."""
answer_file = test_dir / "answer.txt"
try:
content = answer_file.read_text().strip()
# If file is empty, that's acceptable (no matches found)
if not content:
print("✅ Answer file is empty (no matches found)")
return True
lines = content.split('\n')
for i, line in enumerate(lines, 1):
line = line.strip()
if not line:
continue
# Check format: filename.txt,start_position
parts = line.split(',')
if len(parts) != 2:
print(f"❌ Line {i} has incorrect format: {line}")
print(" Expected format: filename.txt,start_position")
return False
filename, start_pos = parts
# Check filename format
if not filename.endswith('.txt') or not filename.startswith('file_'):
print(f"❌ Line {i} has invalid filename: {filename}")
return False
# Check position format (should be integer)
try:
start_int = int(start_pos)
if start_int <= 0:
print(f"❌ Line {i} has invalid position: {start_pos}")
return False
except ValueError:
print(f"❌ Line {i} has non-integer position: {start_pos}")
return False
print("✅ Answer format is correct")
return True
except Exception as e:
print(f"❌ Error reading answer file: {e}")
return False
def find_30_plus_char_matches(test_dir: Path) -> dict:
"""Find all matches with 30 or more characters between files and large_file.txt."""
large_file = test_dir / "large_file.txt"
if not large_file.exists():
print("❌ large_file.txt not found")
return {}
large_content = large_file.read_text()
matches = {}
# Check each file from file_01.txt to file_20.txt
for i in range(1, 21):
filename = f"file_{i:02d}.txt"
file_path = test_dir / filename
if not file_path.exists():
continue
file_content = file_path.read_text()
# Find the longest matching substring (30+ characters)
longest_match = ""
longest_match_start = -1
# Check all possible substrings in the file
for start_pos in range(len(file_content)):
for end_pos in range(start_pos + 30, len(file_content) + 1): # At least 30 characters
substring = file_content[start_pos:end_pos]
# Check if this substring exists in large_file.txt
if substring in large_content:
if len(substring) > len(longest_match):
longest_match = substring
# Find the position in large_file.txt where this substring starts
large_start_pos = large_content.find(substring)
longest_match_start = large_start_pos + 1 # 1-indexed
# If we found a match of 30+ characters, record it
if longest_match and len(longest_match) >= 30:
matches[filename] = longest_match_start
return matches
def verify_matches_are_correct(test_dir: Path) -> bool:
"""Verify that the matches found in answer.txt are actually correct."""
answer_file = test_dir / "answer.txt"
try:
content = answer_file.read_text().strip()
# If no content, check if there should actually be no matches
if not content:
expected_matches = find_30_plus_char_matches(test_dir)
if expected_matches:
print("❌ Answer file is empty but matches should exist")
for filename, start_pos in expected_matches.items():
print(f" Expected: {filename},{start_pos}")
return False
else:
print("✅ No matches found (correct)")
return True
# Parse answer file
answer_matches = {}
lines = content.split('\n')
for line in lines:
line = line.strip()
if not line:
continue
filename, start_pos = line.split(',')
answer_matches[filename] = int(start_pos)
# Get expected matches
expected_matches = find_30_plus_char_matches(test_dir)
# Check if all answer matches are correct
for filename, start_pos in answer_matches.items():
if filename not in expected_matches:
print(f"❌ File {filename} listed in answer but has no valid 30+ character match")
return False
expected_start = expected_matches[filename]
if start_pos != expected_start:
print(f"❌ Incorrect match position for {filename}")
print(f" Expected: {expected_start}")
print(f" Found: {start_pos}")
return False
# Check if all expected matches are in answer
for filename in expected_matches:
if filename not in answer_matches:
print(f"❌ Missing match for {filename} in answer file")
return False
print("✅ All matches are correct")
return True
except Exception as e:
print(f"❌ Error verifying matches: {e}")
return False
def verify_match_length_is_30_plus(test_dir: Path) -> bool:
"""Verify that all matches are at least 30 characters long."""
answer_file = test_dir / "answer.txt"
try:
content = answer_file.read_text().strip()
if not content:
return True # No matches to verify
large_file = test_dir / "large_file.txt"
large_content = large_file.read_text()
lines = content.split('\n')
for line in lines:
line = line.strip()
if not line:
continue
filename, start_pos = line.split(',')
start_int = int(start_pos)
# Get the file content to check the match
file_path = test_dir / filename
file_content = file_path.read_text()
# Find the longest matching substring starting from the given position
longest_match = ""
for end_pos in range(start_int + 30 - 1, len(large_content) + 1): # At least 30 characters
substring = large_content[start_int - 1:end_pos] # Convert to 0-indexed
if substring in file_content:
longest_match = substring
else:
break
if len(longest_match) < 30:
print(f"❌ Match in {filename} is {len(longest_match)} characters, less than 30")
print(f" Starting position: {start_int}")
return False
print("✅ All matches are at least 30 characters long")
return True
except Exception as e:
print(f"❌ Error verifying match lengths: {e}")
return False
def verify_files_exist(test_dir: Path) -> bool:
"""Verify that all files mentioned in answer.txt actually exist."""
answer_file = test_dir / "answer.txt"
try:
content = answer_file.read_text().strip()
if not content:
return True # No files to verify
lines = content.split('\n')
for line in lines:
line = line.strip()
if not line:
continue
filename = line.split(',')[0]
file_path = test_dir / filename
if not file_path.exists():
print(f"❌ File mentioned in answer does not exist: {filename}")
return False
print("✅ All files mentioned in answer exist")
return True
except Exception as e:
print(f"❌ Error verifying file existence: {e}")
return False
def main():
"""Main verification function."""
test_dir = get_test_directory()
print("🔍 Verifying Pattern Matching Task: Find Files with Common Substring...")
# Define verification steps
verification_steps = [
("Answer File Exists", verify_answer_file_exists),
("Answer Format", verify_answer_format),
("Files Exist", verify_files_exist),
("Match Length is 30+", verify_match_length_is_30_plus),
("Matches are Correct", verify_matches_are_correct),
]
# Run all verification steps
all_passed = True
for step_name, verify_func in verification_steps:
print(f"\n--- {step_name} ---")
if not verify_func(test_dir):
all_passed = False
# Final result
print("\n" + "="*50)
if all_passed:
print("✅ File filtering task completed correctly!")
print("🎉 Task verification: PASS")
sys.exit(0)
else:
print("❌ Task verification: FAIL")
sys.exit(1)
if __name__ == "__main__":
main()
================================================
FILE: tasks/filesystem/standard/file_context/uppercase/description.md
================================================
Please use FileSystem tools to finish the following task:
### Task Description
You need to process 10 text files (file_01.txt to file_10.txt) and convert their content to uppercase format.
### Task Objectives
1. **Create an uppercase directory** in the test environment root
2. **Convert each file** from file_01.txt to file_10.txt to uppercase
3. **Save converted files** in the uppercase/ directory with the same names
4. **Count words** in each original file (file_01.txt to file_10.txt)
5. **Create answer.txt** with word counts in the specified format.
### Specified Format of answer.txt
Create a file named `answer.txt` in the `uppercase/` directory.
**Requirements:**
- Each line should follow the format: `filename:word_count`
- Include all 10 files: file_01.txt, file_02.txt, ..., file_10.txt
- Use the exact filename format (file_01.txt, file_02.txt, etc.)
- One entry per line
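For illustration, a minimal sketch of the conversion and per-file word count (whitespace-delimited, matching how the verification script below counts words); the helper name is an assumption:

```python
from pathlib import Path

def convert_and_count(test_dir: Path) -> None:
    """Convert file_01.txt..file_10.txt to uppercase and record word counts."""
    out_dir = test_dir / "uppercase"
    out_dir.mkdir(exist_ok=True)
    lines = []
    for i in range(1, 11):
        name = f"file_{i:02d}.txt"
        text = (test_dir / name).read_text()
        (out_dir / name).write_text(text.upper())
        lines.append(f"{name}:{len(text.split())}")  # whitespace-separated word count
    (out_dir / "answer.txt").write_text("\n".join(lines))
```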
================================================
FILE: tasks/filesystem/standard/file_context/uppercase/meta.json
================================================
{
"task_id": "uppercase",
"task_name": "Uppercase",
"category_id": "file_context",
"category_name": "File Context",
"description": "Convert the content of 10 specified files to uppercase format and calculate the total word count across all processed files.",
"author": "Lingjun Chen",
"created_at": "2025-08-19",
"difficulty": "L3",
"tags": [
"content transformation",
"data extraction"
],
"mcp": [
"filesystem"
],
"meta_data": {
"stateType": "text",
"stateContent": "file_context/\n ├── file_01.txt\n ├── file_02.txt\n ├── file_03.txt\n ├── file_04.txt\n ├── file_05.txt\n ├── file_06.txt\n ├── file_07.txt\n ├── file_08.txt\n ├── file_09.txt\n ├── file_10.txt\n ├── file_11.txt\n ├── file_12.txt\n ├── file_13.txt\n ├── file_14.txt\n ├── file_15.txt\n ├── file_16.txt\n ├── file_17.txt\n ├── file_18.txt\n ├── file_19.txt\n ├── file_20.txt\n └── large_file.txt",
"stateUrl": "https://storage.mcpmark.ai/filesystem/file_context.zip",
"stateOriginalUrl": null
}
}
================================================
FILE: tasks/filesystem/standard/file_context/uppercase/verify.py
================================================
#!/usr/bin/env python3
"""
Verification script for File Context Task: Convert Files to Uppercase
"""
import sys
from pathlib import Path
import os
import re
def get_test_directory() -> Path:
"""Get the test directory from FILESYSTEM_TEST_DIR env var."""
test_root = os.environ.get("FILESYSTEM_TEST_DIR")
if not test_root:
raise ValueError("FILESYSTEM_TEST_DIR environment variable is required")
return Path(test_root)
def verify_uppercase_directory_exists(test_dir: Path) -> bool:
"""Verify that the uppercase directory exists."""
uppercase_dir = test_dir / "uppercase"
if not uppercase_dir.exists():
print("| ❌ Directory 'uppercase' not found")
return False
if not uppercase_dir.is_dir():
print("| ❌ 'uppercase' exists but is not a directory")
return False
print("| ✓ Uppercase directory found")
return True
def verify_uppercase_files_exist(test_dir: Path) -> bool:
"""Verify that all 10 uppercase files exist."""
uppercase_dir = test_dir / "uppercase"
for i in range(1, 11):
filename = f"file_{i:02d}.txt"
file_path = uppercase_dir / filename
if not file_path.exists():
print(f"| ❌ File '{filename}' not found in uppercase directory")
return False
print("| ✓ All 10 uppercase files found")
return True
def verify_uppercase_content(test_dir: Path) -> bool:
"""Verify that uppercase files contain the correct uppercase content."""
uppercase_dir = test_dir / "uppercase"
for i in range(1, 11):
filename = f"file_{i:02d}.txt"
original_file = test_dir / filename
uppercase_file = uppercase_dir / filename
if not original_file.exists():
print(f"| ❌ Original file '{filename}' not found")
return False
try:
original_content = original_file.read_text()
uppercase_content = uppercase_file.read_text()
# Check if uppercase content is the uppercase version of original
expected_uppercase = original_content.upper()
if uppercase_content != expected_uppercase:
print(f"| ❌ File '{filename}' content is not properly converted to uppercase")
return False
except Exception as e:
print(f"| ❌ Error reading file '{filename}': {e}")
return False
print("| ✓ All uppercase files contain correct uppercase content")
return True
def verify_answer_file_exists(test_dir: Path) -> bool:
"""Verify that the answer.txt file exists in the uppercase directory."""
uppercase_dir = test_dir / "uppercase"
answer_file = uppercase_dir / "answer.txt"
if not answer_file.exists():
print("| ❌ File 'answer.txt' not found in uppercase directory")
return False
print("| ✓ Answer file found in uppercase directory")
return True
def verify_answer_format(test_dir: Path) -> bool:
"""Verify that the answer file has the correct format."""
uppercase_dir = test_dir / "uppercase"
answer_file = uppercase_dir / "answer.txt"
try:
content = answer_file.read_text().strip()
if not content:
print("| ❌ Answer file is empty")
return False
lines = content.split('\n')
# Check if we have exactly 10 lines
if len(lines) != 10:
print(f"| ❌ Answer file has {len(lines)} lines, expected 10")
return False
for i, line in enumerate(lines, 1):
line = line.strip()
if not line:
print(f"| ❌ Line {i} is empty")
return False
# Check format: filename:word_count
if ':' not in line:
print(f"| ❌ Line {i} has incorrect format: {line}")
print(" Expected format: filename:word_count")
return False
parts = line.split(':', 1)
if len(parts) != 2:
print(f"| ❌ Line {i} has incorrect format: {line}")
print(" Expected format: filename:word_count")
return False
filename, word_count_str = parts
# Check filename format
if not filename.endswith('.txt') or not filename.startswith('file_'):
print(f"| ❌ Line {i} has invalid filename: {filename}")
return False
# Check word count format (should be integer)
try:
word_count = int(word_count_str)
if word_count <= 0:
print(f"| ❌ Line {i} has invalid word count: {word_count_str}")
return False
except ValueError:
print(f"| ❌ Line {i} has non-integer word count: {word_count_str}")
return False
print("| ✓ Answer format is correct")
return True
except Exception as e:
print(f"| ❌ Error reading answer file: {e}")
return False
def count_words_in_file(file_path: Path) -> int:
"""Count words in a file."""
try:
content = file_path.read_text()
# Split by whitespace and filter out empty strings
words = [word for word in content.split() if word.strip()]
return len(words)
except Exception as e:
print(f"| ❌ Error reading file {file_path}: {e}")
return 0
def verify_word_counts_are_correct(test_dir: Path) -> bool:
"""Verify that the word counts in answer.txt are correct."""
uppercase_dir = test_dir / "uppercase"
answer_file = uppercase_dir / "answer.txt"
try:
content = answer_file.read_text().strip()
lines = content.split('\n')
# Expected word counts based on answer.md
expected_counts = [22, 22, 22, 22, 18, 22, 22, 22, 18, 20]
# Create a set of expected file entries for easier checking
expected_entries = set()
for i in range(1, 11):
filename = f"file_{i:02d}.txt"
expected_count = expected_counts[i - 1]
if i == 6: # Special case for file_06.txt: can be 21 or 22
expected_entries.add(f"{filename}:21")
expected_entries.add(f"{filename}:22")
else:
expected_entries.add(f"{filename}:{expected_count}")
# Check each line in the answer file
found_entries = set()
for line in lines:
line = line.strip()
if line in expected_entries:
found_entries.add(line)
else:
print(f"| ❌ Invalid entry: {line}")
return False
# Check if we found all expected entries
if len(found_entries) != 10:
print(f"| ❌ Found {len(found_entries)} entries, expected 10")
missing = expected_entries - found_entries
if missing:
print(f" Missing entries: {missing}")
return False
print("| ✓ All word counts are correct")
return True
except Exception as e:
print(f"| ❌ Error verifying word counts: {e}")
return False
def verify_all_files_are_included(test_dir: Path) -> bool:
"""Verify that all 10 files are included in the answer."""
uppercase_dir = test_dir / "uppercase"
answer_file = uppercase_dir / "answer.txt"
try:
content = answer_file.read_text().strip()
lines = content.split('\n')
# Check that all 10 files are present
found_files = set()
for line in lines:
parts = line.split(':', 1)
filename = parts[0]
found_files.add(filename)
expected_files = {f"file_{i:02d}.txt" for i in range(1, 11)}
if found_files != expected_files:
missing = expected_files - found_files
extra = found_files - expected_files
if missing:
print(f"| ❌ Missing files in answer: {missing}")
if extra:
print(f"| ❌ Extra files in answer: {extra}")
return False
print("| ✓ All 10 files are included in answer")
return True
except Exception as e:
print(f"| ❌ Error verifying file inclusion: {e}")
return False
def main():
"""Main verification function."""
try:
test_dir = get_test_directory()
print(f"| 🔍 Verifying Uppercase in: {test_dir}")
print('|')
# Run all verification checks
checks = [
("Uppercase directory exists", verify_uppercase_directory_exists),
("Uppercase files exist", verify_uppercase_files_exist),
("Uppercase content is correct", verify_uppercase_content),
("Answer file exists in uppercase directory", verify_answer_file_exists),
("Answer format is correct", verify_answer_format),
("All files are included", verify_all_files_are_included),
("Word counts are correct", verify_word_counts_are_correct),
]
all_passed = True
for check_name, check_func in checks:
print(f"| Checking {check_name}...")
if not check_func(test_dir):
all_passed = False
print('|')
if all_passed:
print("| 🎉 All verification checks passed!")
sys.exit(0)
else:
print("| ❌ Some verification checks failed!")
sys.exit(1)
except Exception as e:
print(f"| ❌ Verification failed with error: {e}")
sys.exit(1)
if __name__ == "__main__":
main()
================================================
FILE: tasks/filesystem/standard/file_property/size_classification/description.md
================================================
Please use FileSystem tools to finish the following task:
### Task Description
Classify all files in the test directory into three categories based on their file size. Create three subdirectories and move files accordingly.
### Task Objectives
1. **Create three directories** in the test directory:
- `small_files/` - for files smaller than 300 bytes
- `medium_files/` - for files between 300 and 700 bytes (inclusive)
- `large_files/` - for files larger than 700 bytes
2. **Move all files** from the test directory into the appropriate subdirectory based on their size
3. **Handle all file types** - classify all files regardless of their extension (.txt, .jpg, .MOV, etc.)
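For reference, a minimal Python sketch of the size rules above (illustrative only; the task itself should be completed with the FileSystem tools, and the directory path is a placeholder):

```python
import shutil
from pathlib import Path

def classify_by_size(test_dir: Path) -> None:
    """Move each regular file into small_files/, medium_files/, or large_files/ by byte size."""
    for name in ("small_files", "medium_files", "large_files"):
        (test_dir / name).mkdir(exist_ok=True)
    for path in list(test_dir.iterdir()):
        if not path.is_file() or path.name == ".DS_Store":
            continue  # classify regular files only; skip system files
        size = path.stat().st_size
        if size < 300:
            target = "small_files"
        elif size <= 700:  # 300-700 bytes inclusive
            target = "medium_files"
        else:
            target = "large_files"
        shutil.move(str(path), str(test_dir / target / path.name))
```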
================================================
FILE: tasks/filesystem/standard/file_property/size_classification/meta.json
================================================
{
"task_id": "size_classification",
"task_name": "Size Classification",
"category_id": "file_property",
"category_name": "File Property",
"description": "Classify all files in the folder by size into distinct categories (small/medium/large) and generate a comprehensive summary report with statistics.",
"author": "Lingjun Chen",
"created_at": "2025-08-07",
"difficulty": "L3",
"tags": [
"file organization",
"pattern analysis"
],
"mcp": [
"filesystem"
],
"meta_data": {
"stateType": "text",
"stateContent": "file_property/\n ├── bear.jpg\n ├── bridge.jpg\n ├── bus.MOV\n ├── random_file_1.txt\n ├── random_file_2.txt\n ├── random_file_3.txt\n ├── road.MOV\n └── sg.jpg",
"stateUrl": "https://storage.mcpmark.ai/filesystem/file_property.zip",
"stateOriginalUrl": null
}
}
================================================
FILE: tasks/filesystem/standard/file_property/size_classification/verify.py
================================================
#!/usr/bin/env python3
"""
Verification script for File Classification Task
"""
import sys
from pathlib import Path
import os
def get_test_directory() -> Path:
"""Get the test directory from FILESYSTEM_TEST_DIR env var."""
test_root = os.environ.get("FILESYSTEM_TEST_DIR")
if not test_root:
raise ValueError("FILESYSTEM_TEST_DIR environment variable is required")
return Path(test_root)
def get_expected_classification():
"""Return the expected file classification based on answer.md."""
return {
"small_files": ["random_file_1.txt", "random_file_3.txt"],
"medium_files": ["random_file_2.txt"],
"large_files": ["bear.jpg", "sg.jpg", "road.MOV", "bus.MOV", "bridge.jpg"]
}
def verify_directories_exist(test_dir: Path) -> bool:
"""Verify that all three required directories exist."""
required_dirs = ["small_files", "medium_files", "large_files"]
for dir_name in required_dirs:
dir_path = test_dir / dir_name
if not dir_path.exists():
print(f"❌ Directory '{dir_name}' not found")
return False
if not dir_path.is_dir():
print(f"❌ '{dir_name}' exists but is not a directory")
return False
print("✅ All required directories exist")
return True
def verify_file_classification(test_dir: Path) -> bool:
"""Verify that files are correctly classified into the right directories."""
expected_classification = get_expected_classification()
for dir_name, expected_files in expected_classification.items():
dir_path = test_dir / dir_name
# Check that all expected files are in the directory
missing_files = []
for filename in expected_files:
file_path = dir_path / filename
if not file_path.exists():
missing_files.append(filename)
if missing_files:
print(f"❌ Missing files in '{dir_name}': {missing_files}")
return False
# Check that no unexpected files are in the directory (ignore .DS_Store and similar system files)
actual_files = [f.name for f in dir_path.iterdir() if f.is_file()]
# Filter out system files that are commonly present
system_files = ['.DS_Store', 'Thumbs.db', '.DS_Store?', '._.DS_Store']
unexpected_files = [f for f in actual_files if f not in expected_files and f not in system_files]
if unexpected_files:
print(f"❌ Unexpected files in '{dir_name}': {unexpected_files}")
return False
print("✅ All files are correctly classified")
return True
def verify_no_files_in_root(test_dir: Path) -> bool:
"""Verify that no files remain in the root test directory."""
root_files = [f for f in test_dir.iterdir() if f.is_file()]
# Filter out system files that are commonly present
system_files = ['.DS_Store', 'Thumbs.db', '.DS_Store?', '._.DS_Store']
non_system_files = [f for f in root_files if f.name not in system_files]
if non_system_files:
print(f"❌ Files still present in root directory: {[f.name for f in non_system_files]}")
return False
print("✅ No files remain in root directory")
return True
def verify_file_sizes(test_dir: Path) -> bool:
"""Verify that files are actually in the correct size categories."""
size_ranges = {
"small_files": (0, 299), # < 300 bytes
"medium_files": (300, 700), # 300-700 bytes (inclusive)
"large_files": (701, float('inf')) # > 700 bytes
}
for dir_name, (min_size, max_size) in size_ranges.items():
dir_path = test_dir / dir_name
for file_path in dir_path.iterdir():
if file_path.is_file():
file_size = file_path.stat().st_size
if dir_name == "small_files" and file_size >= 300:
print(f"❌ File {file_path.name} in small_files but size is {file_size} bytes")
return False
elif dir_name == "medium_files" and (file_size < 300 or file_size > 700):
print(f"❌ File {file_path.name} in medium_files but size is {file_size} bytes")
return False
elif dir_name == "large_files" and file_size <= 700:
print(f"❌ File {file_path.name} in large_files but size is {file_size} bytes")
return False
print("✅ All files are in correct size categories")
return True
def verify_total_file_count(test_dir: Path) -> bool:
"""Verify that all original files are accounted for."""
expected_classification = get_expected_classification()
total_expected = sum(len(files) for files in expected_classification.values())
total_actual = 0
for dir_name in ["small_files", "medium_files", "large_files"]:
dir_path = test_dir / dir_name
if dir_path.exists():
# Count only non-system files
system_files = ['.DS_Store', 'Thumbs.db', '.DS_Store?', '._.DS_Store']
files_in_dir = [f for f in dir_path.iterdir() if f.is_file() and f.name not in system_files]
total_actual += len(files_in_dir)
if total_actual != total_expected:
print(f"❌ Expected {total_expected} files total, found {total_actual}")
return False
print(f"✅ Total file count is correct: {total_actual}")
return True
def main():
"""Main verification function."""
try:
test_dir = get_test_directory()
print(f"🔍 Verifying file classification in: {test_dir}")
# Run all verification checks
checks = [
("Directory existence", verify_directories_exist),
("File classification", verify_file_classification),
("No files in root", verify_no_files_in_root),
("File size validation", verify_file_sizes),
("Total file count", verify_total_file_count)
]
all_passed = True
for check_name, check_func in checks:
print(f"\n📋 Checking: {check_name}")
if not check_func(test_dir):
all_passed = False
if all_passed:
print("\n🎉 All verification checks passed!")
sys.exit(0)
else:
print("\n❌ Some verification checks failed!")
sys.exit(1)
except Exception as e:
print(f"❌ Verification failed with error: {e}")
sys.exit(1)
if __name__ == "__main__":
main()
================================================
FILE: tasks/filesystem/standard/file_property/time_classification/description.md
================================================
Please use FileSystem tools to finish the following task:
### Task Description
Analyze the creation time (ctime) of all files in the test directory and organize them into a hierarchical directory structure based on their creation dates.
### Task Objectives
1. **Read metadata** of all files in the test directory
2. **Analyze creation times** (ctime) of all files (excluding .DS_Store)
3. **Create directory structure** organized by month/day based on creation time
4. **Move files** to appropriate directories
5. **Create metadata analysis files** in each directory
### Expected Output
#### Directory Structure
Create directories in the format: `MM/DD/` where:
- MM = month (two digits, e.g., 01, 02)
- DD = day (two digits, e.g., 07, 09, 11, 26)
#### Metadata Analysis Files
Create a file named `metadata_analyse.txt` in each directory containing exactly two lines:
- **Line 1**: Oldest filename and its creation time (excluding .DS_Store)
- **Line 2**: Latest filename and its creation time (excluding .DS_Store)
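A minimal sketch of the grouping rule, assuming `st_ctime` reflects creation time on the platform in use (a reference only; the task should be completed with the FileSystem tools):

```python
from collections import defaultdict
from datetime import datetime
from pathlib import Path

def group_by_creation_date(test_dir: Path) -> dict:
    """Map 'MM/DD' date keys to the names of files created on that date (ignoring .DS_Store)."""
    groups = defaultdict(list)
    for path in test_dir.iterdir():
        if not path.is_file() or path.name == ".DS_Store":
            continue
        created = datetime.fromtimestamp(path.stat().st_ctime)
        groups[created.strftime("%m/%d")].append(path.name)
    return dict(groups)
```

Each `MM/DD` key corresponds to one target directory; the oldest and newest file in each group supply the two lines of that directory's `metadata_analyse.txt`.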
================================================
FILE: tasks/filesystem/standard/file_property/time_classification/meta.json
================================================
{
"task_id": "time_classification",
"task_name": "Time Classification",
"category_id": "file_property",
"category_name": "File Property",
"description": "Organize files based on modification timestamps into temporal categories and create a detailed time-based classification report with groupings.",
"author": "Lingjun Chen",
"created_at": "2025-08-07",
"difficulty": "L3",
"tags": [
"file organization",
"data extraction",
"pattern analysis"
],
"mcp": [
"filesystem"
],
"meta_data": {
"stateType": "text",
"stateContent": "file_property/\n ├── bear.jpg\n ├── bridge.jpg\n ├── bus.MOV\n ├── random_file_1.txt\n ├── random_file_2.txt\n ├── random_file_3.txt\n ├── road.MOV\n └── sg.jpg",
"stateUrl": "https://storage.mcpmark.ai/filesystem/file_property.zip",
"stateOriginalUrl": null
}
}
================================================
FILE: tasks/filesystem/standard/file_property/time_classification/verify.py
================================================
#!/usr/bin/env python3
"""
Verification script for File Organization by Creation Time Task
"""
import sys
from pathlib import Path
import os
def get_test_directory() -> Path:
"""Get the test directory from FILESYSTEM_TEST_DIR env var."""
test_root = os.environ.get("FILESYSTEM_TEST_DIR")
if not test_root:
raise ValueError("FILESYSTEM_TEST_DIR environment variable is required")
return Path(test_root)
def get_month_mapping():
"""Return mapping for both numeric and alphabetic month representations."""
return {
"07": ["07", "7", "jul", "Jul", "JUL"],
"08": ["08", "8", "aug", "Aug", "AUG"]
}
def get_day_mapping():
"""Return mapping for day representations."""
return {
"09": ["09", "9"],
"25": ["25"],
"26": ["26"],
"06": ["06", "6"]
}
def get_expected_directory_structure():
"""Return the expected directory structure based on answer.md."""
return {
"07": {
"09": ["sg.jpg"],
"25": ["bus.MOV"],
"26": ["road.MOV"]
},
"08": {
"06": ["bear.jpg", "bridge.jpg", "random_file_1.txt", "random_file_2.txt", "random_file_3.txt"]
}
}
def find_month_directory(test_dir: Path, expected_month: str) -> Path:
"""Find the actual month directory, handling both numeric and alphabetic representations."""
month_mapping = get_month_mapping()
valid_month_names = month_mapping.get(expected_month, [expected_month])
for month_name in valid_month_names:
month_dir = test_dir / month_name
if month_dir.exists() and month_dir.is_dir():
return month_dir
return None
def find_day_directory(month_dir: Path, expected_day: str) -> Path:
"""Find the actual day directory, handling both numeric representations."""
day_mapping = get_day_mapping()
valid_day_names = day_mapping.get(expected_day, [expected_day])
for day_name in valid_day_names:
day_dir = month_dir / day_name
if day_dir.exists() and day_dir.is_dir():
return day_dir
return None
def verify_directory_structure(test_dir: Path) -> bool:
"""Verify that the correct directory structure exists."""
expected_structure = get_expected_directory_structure()
for expected_month, days in expected_structure.items():
month_dir = find_month_directory(test_dir, expected_month)
if month_dir is None:
valid_names = get_month_mapping().get(expected_month, [expected_month])
print(f"❌ Month directory not found. Expected one of: {valid_names}")
return False
for day, expected_files in days.items():
day_dir = find_day_directory(month_dir, day)
if day_dir is None:
valid_day_names = get_day_mapping().get(day, [day])
print(f"❌ Day directory '{month_dir.name}/{day}' not found. Expected one of: {valid_day_names}")
return False
if not day_dir.is_dir():
print(f"❌ '{month_dir.name}/{day_dir.name}' exists but is not a directory")
return False
print("✅ Directory structure is correct")
return True
def verify_files_in_directories(test_dir: Path) -> bool:
"""Verify that files are in the correct directories."""
expected_structure = get_expected_directory_structure()
for expected_month, days in expected_structure.items():
month_dir = find_month_directory(test_dir, expected_month)
if month_dir is None:
continue # Already handled in verify_directory_structure
for day, expected_files in days.items():
day_dir = find_day_directory(month_dir, day)
if day_dir is None:
continue # Already handled in verify_directory_structure
# Check that all expected files are in the directory
missing_files = []
for filename in expected_files:
file_path = day_dir / filename
if not file_path.exists():
missing_files.append(filename)
if missing_files:
print(f"❌ Missing files in '{month_dir.name}/{day_dir.name}': {missing_files}")
return False
# Check that no unexpected files are in the directory (ignore .DS_Store and metadata_analyse.txt)
actual_files = [f.name for f in day_dir.iterdir() if f.is_file()]
system_files = ['.DS_Store', 'Thumbs.db', '.DS_Store?', '._.DS_Store', 'metadata_analyse.txt']
unexpected_files = [f for f in actual_files if f not in expected_files and f not in system_files]
if unexpected_files:
print(f"❌ Unexpected files in '{month_dir.name}/{day_dir.name}': {unexpected_files}")
return False
print("✅ All files are in correct directories")
return True
def verify_metadata_analysis_files(test_dir: Path) -> bool:
"""Verify that metadata_analyse.txt files exist and have correct content."""
expected_structure = get_expected_directory_structure()
for expected_month, days in expected_structure.items():
month_dir = find_month_directory(test_dir, expected_month)
if month_dir is None:
continue # Already handled in verify_directory_structure
for day, expected_files in days.items():
day_dir = find_day_directory(month_dir, day)
if day_dir is None:
continue # Already handled in verify_directory_structure
metadata_file = day_dir / "metadata_analyse.txt"
if not metadata_file.exists():
print(f"❌ metadata_analyse.txt not found in '{month_dir.name}/{day_dir.name}'")
return False
try:
content = metadata_file.read_text().strip()
lines = content.split('\n')
# Check that there are exactly 2 lines
if len(lines) != 2:
print(f"❌ metadata_analyse.txt in '{month_dir.name}/{day_dir.name}' has {len(lines)} lines, expected 2")
return False
# Check each line - more flexible verification
for line_num, line in enumerate(lines, 1):
line_lower = line.lower()
# Check filename based on expected_month and day
expected_filename = None
if expected_month == "07" and day == "09":
expected_filename = "sg.jpg"
elif expected_month == "07" and day == "25":
expected_filename = "bus.mov"
elif expected_month == "07" and day == "26":
expected_filename = "road.mov"
elif expected_month == "08" and day == "06":
# For 08/06, check if it's one of the expected files
if line_num == 1: # First line should be bear.jpg
expected_filename = "bear.jpg"
else: # Second line should be one of the random files
expected_filenames = ["random_file_1.txt", "random_file_2.txt", "random_file_3.txt"]
if not any(filename in line_lower for filename in expected_filenames):
print(f"❌ Line {line_num} in '{month_dir.name}/{day_dir.name}' should contain one of {expected_filenames}: {line}")
return False
continue # Skip other checks for this line
if expected_filename and expected_filename not in line_lower:
print(f"❌ Line {line_num} in '{month_dir.name}/{day_dir.name}' should contain '{expected_filename}': {line}")
return False
# Check month letters
month_letters = None
if expected_month == "07":
month_letters = ["jul", "7"]
elif expected_month == "08":
month_letters = ["aug", "8"]
if month_letters and not any(letter in line_lower for letter in month_letters):
print(f"❌ Line {line_num} in '{month_dir.name}/{day_dir.name}' should contain month letters: {line}")
return False
# Check year (2025)
if "2025" not in line_lower:
print(f"❌ Line {line_num} in '{month_dir.name}/{day_dir.name}' should contain '2025': {line}")
return False
# Check day number - support both formats
valid_day_names = get_day_mapping().get(day, [day])
if not any(day_name in line_lower for day_name in valid_day_names):
print(f"❌ Line {line_num} in '{month_dir.name}/{day_dir.name}' should contain day '{day}' (or {valid_day_names}): {line}")
return False
except Exception as e:
print(f"❌ Error reading metadata_analyse.txt in '{month_dir.name}/{day_dir.name}': {e}")
return False
print("✅ All metadata_analyse.txt files are correct")
return True
def verify_no_files_in_root(test_dir: Path) -> bool:
"""Verify that no files remain in the root test directory."""
root_files = [f for f in test_dir.iterdir() if f.is_file()]
# Filter out system files that are commonly present
system_files = ['.DS_Store', 'Thumbs.db', '.DS_Store?', '._.DS_Store']
non_system_files = [f for f in root_files if f.name not in system_files]
if non_system_files:
print(f"❌ Files still present in root directory: {[f.name for f in non_system_files]}")
return False
print("✅ No files remain in root directory")
return True
def verify_total_file_count(test_dir: Path) -> bool:
"""Verify that all original files are accounted for."""
expected_structure = get_expected_directory_structure()
total_expected = sum(len(files) for days in expected_structure.values() for files in days.values())
total_actual = 0
for expected_month, days in expected_structure.items():
month_dir = find_month_directory(test_dir, expected_month)
if month_dir is None:
continue
for day in days:
day_dir = find_day_directory(month_dir, day)
if day_dir and day_dir.exists():
# Count only non-system files
system_files = ['.DS_Store', 'Thumbs.db', '.DS_Store?', '._.DS_Store', 'metadata_analyse.txt']
files_in_dir = [f for f in day_dir.iterdir() if f.is_file() and f.name not in system_files]
total_actual += len(files_in_dir)
if total_actual != total_expected:
print(f"❌ Expected {total_expected} files total, found {total_actual}")
return False
print(f"✅ Total file count is correct: {total_actual}")
return True
def main():
"""Main verification function."""
try:
test_dir = get_test_directory()
print(f"🔍 Verifying Time Classification in: {test_dir}")
# Run all verification checks
checks = [
("Directory structure", verify_directory_structure),
("Files in directories", verify_files_in_directories),
("Metadata analysis files", verify_metadata_analysis_files),
("No files in root", verify_no_files_in_root),
("Total file count", verify_total_file_count)
]
all_passed = True
for check_name, check_func in checks:
print(f"\n📋 Checking: {check_name}")
if not check_func(test_dir):
all_passed = False
if all_passed:
print("\n🎉 All verification checks passed!")
sys.exit(0)
else:
print("\n❌ Some verification checks failed!")
sys.exit(1)
except Exception as e:
print(f"❌ Verification failed with error: {e}")
sys.exit(1)
if __name__ == "__main__":
main()
================================================
FILE: tasks/filesystem/standard/folder_structure/structure_analysis/description.md
================================================
Please use FileSystem tools to finish the following task:
You need to recursively traverse the entire folder structure under the main directory and generate a detailed statistical report in a file named `structure_analysis.txt`.
In all tasks, ignore `.DS_Store` files.
In all tasks, you should not change or delete any existing files.
Do not try to use python code.
---
### 1. File Statistics
Count the following information for the entire directory structure:
- total number of files
- total number of folders
- total size of the whole folder (in bytes; include .DS_Store only in this subtask)
**Format (one item per line):**
total number of files: X
total number of folders: Y
total size of all files: Z
---
### 2. Depth Analysis
Identify the deepest folder path(s) in the directory and calculate the depth level.
- Use relative paths based on main directory.
- **Write the folder path only up to the folder, not including the file name. For example, if the file path is `./complex_structure/A/B/C/def.txt`, then the path in your report should be `complex_structure/A/B/C`, and the depth is `4`.**
- If multiple deepest paths exist, list only one.
**Format (one item per line):**
depth: N
PATH
---
### 3. File Type Classification
Categorize files by their extensions and count the number of files for each type.
Files without extensions should also be included.
**Format (one extension per line):**
txt: count
py: count
jpg: count
mov: count
(no extension): count
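The report itself must be produced without Python code; the sketch below is purely illustrative of what each statistic refers to (the root path is a placeholder):

```python
import os
from pathlib import Path

def summarize_structure(root: Path) -> dict:
    """Collect file/folder counts, total size, deepest folder, and per-extension counts under root."""
    stats = {"files": 0, "folders": 0, "size": 0, "by_ext": {}, "deepest": ("", 0)}
    for dirpath, dirnames, filenames in os.walk(root):
        rel = Path(dirpath).relative_to(root)
        if len(rel.parts) > stats["deepest"][1]:
            stats["deepest"] = (str(rel), len(rel.parts))
        stats["folders"] += len(dirnames)
        for name in filenames:
            stats["size"] += (Path(dirpath) / name).stat().st_size  # size includes .DS_Store
            if name == ".DS_Store":
                continue  # ignored everywhere else, per the task statement
            stats["files"] += 1
            ext = Path(name).suffix.lstrip(".").lower() or "(no extension)"
            stats["by_ext"][ext] = stats["by_ext"].get(ext, 0) + 1
    return stats
```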
================================================
FILE: tasks/filesystem/standard/folder_structure/structure_analysis/meta.json
================================================
{
"task_id": "structure_analysis",
"task_name": "Structure Analysis",
"category_id": "folder_structure",
"category_name": "Folder Structure",
"description": "Perform thorough analysis of complex folder hierarchy to generate a detailed structural summary report with comprehensive file statistics.",
"author": "Lingjun Chen",
"created_at": "2025-08-16",
"difficulty": "L3",
"tags": [
"pattern analysis",
"data extraction"
],
"mcp": [
"filesystem"
],
"meta_data": {
"stateType": "text",
"stateContent": "folder_structure/\n └── complex_structure/\n ├── deeply/\n │ └── nested/\n │ └── folder/\n │ └── structure/\n ├── empty_folder/\n ├── folder_lxkHt_0_1/\n │ └── file_PeLzC_0.txt\n ├── folder_QdTAj_0_2/\n │ ├── folder_eXccj_1_0/\n │ │ ├── folder_Mqlwh_2_1/\n │ │ │ ├── folder_cKxcP_3_3/\n │ │ │ │ ├── folder_BPTMK_4_1/\n │ │ │ │ │ └── file_RHtBP_0.txt\n │ │ │ │ ├── folder_QNqjq_4_0/\n │ │ │ │ │ ├── folder_gRwPE_5_1/\n │ │ │ │ │ │ ├── file_jVlpp_0.txt\n │ │ │ │ │ │ └── file_vJuHz_1.txt\n │ │ │ │ │ ├── folder_XdXYJ_5_0/\n │ │ │ │ │ │ └── file_KvkKi_0.txt\n │ │ │ │ │ ├── file_gGxLG_2.txt\n │ │ │ │ │ ├── file_Hzkxo_0.txt\n │ │ │ │ │ └── file_XRjeh_1.txt\n │ │ │ │ ├── folder_vIBIt_4_2/\n │ │ │ │ │ ├── folder_kRDNS_5_0/\n │ │ │ │ │ │ └── file_wFSjJ_0.txt\n │ │ │ │ │ └── file_NyBSO_0.txt\n │ │ │ │ ├── file_EOCNf_1.txt\n │ │ │ │ └── file_gmrXA_0.txt\n │ │ │ ├── folder_NcruA_3_1/\n │ │ │ │ ├── file_bLWDj_1.txt\n │ │ │ │ └── file_WAftR_0.txt\n │ │ │ ├── folder_qCDFI_3_2/\n │ │ │ │ ├── file_eSMOJ_0.txt\n │ │ │ │ ├── file_oxADy_2.txt\n │ │ │ │ └── file_RTbbc_1.txt\n │ │ │ ├── folder_QVHUU_3_0/\n │ │ │ │ ├── folder_FEPTK_4_1/\n │ │ │ │ │ ├── folder_GHoMC_5_1/\n │ │ │ │ │ │ └── file_rAMYd_0.txt\n │ │ │ │ │ ├── folder_iBDUY_5_0/\n │ │ │ │ │ │ └── file_IJCaw_0.txt\n │ │ │ │ │ ├── folder_VRXgp_5_2/\n │ │ │ │ │ │ └── file_hkUmS_0.txt\n │ │ │ │ │ ├── file_nqLAf_1.txt\n │ │ │ │ │ └── file_XflmA_0.txt\n │ │ │ │ ├── folder_FlPoK_4_3/\n │ │ │ │ │ ├── folder_hSVNm_5_3/\n │ │ │ │ │ │ └── file_klnbn_0.txt\n │ │ │ │ │ ├── folder_iZuEl_5_0/\n │ │ │ │ │ │ └── file_LqAmy_0.txt\n │ │ │ │ │ ├── folder_LcURj_5_2/\n │ │ │ │ │ │ ├── file_RgwOS_1.txt\n │ │ │ │ │ │ └── file_ZHnYb_0.txt\n │ │ │ │ │ ├── folder_tuZQJ_5_1/\n │ │ │ │ │ │ └── file_LHuIx_0.txt\n │ │ │ │ │ ├── file_asJnB_1.txt\n │ │ │ │ │ └── file_EzLdu_0.txt\n │ │ │ │ ├── folder_ndhsJ_4_0/\n │ │ │ │ │ ├── folder_CUSXK_5_0/\n │ │ │ │ │ │ ├── file_DpiuM_1.txt\n │ │ │ │ │ │ └── file_pSqeG_0.txt\n │ │ │ │ │ ├── folder_pstmE_5_1/\n │ │ │ │ │ │ └── file_YwdJt_0.txt\n │ │ │ │ │ ├── folder_StlsP_5_2/\n │ │ │ │ │ │ ├── file_kriBJ_0.txt\n │ │ │ │ │ │ └── file_XCEdm_1.txt\n │ │ │ │ │ ├── file_ToDjh_1.txt\n │ │ │ │ │ └── file_xbIVx_0.txt\n │ │ │ │ ├── folder_PJBok_4_4/\n │ │ │ │ │ ├── folder_mzxaf_5_0/\n │ │ │ │ │ │ ├── file_ILBzj_2.txt\n │ │ │ │ │ │ ├── file_MTGMm_1.txt\n │ │ │ │ │ │ └── file_zBDqz_0.txt\n │ │ │ │ │ ├── folder_sULMj_5_1/\n │ │ │ │ │ │ ├── file_BHziw_1.txt\n │ │ │ │ │ │ ├── file_sIjiu_2.txt\n │ │ │ │ │ │ └── file_VqNkB_0.txt\n │ │ │ │ │ ├── folder_vypSi_5_3/\n │ │ │ │ │ │ ├── file_kZbIm_1.txt\n │ │ │ │ │ │ └── file_sOBtE_0.txt\n │ │ │ │ │ ├── folder_ZLGHy_5_2/\n │ │ │ │ │ │ ├── file_azaFF_0.txt\n │ │ │ │ │ │ └── file_nAFRe_1.txt\n │ │ │ │ │ ├── file_mIkQU_0.txt\n │ │ │ │ │ └── file_sGPxd_1.txt\n │ │ │ │ ├── folder_VTbEG_4_2/\n │ │ │ │ │ ├── file_HtYLg_0.txt\n │ │ │ │ │ ├── file_JXjMd_1.txt\n │ │ │ │ │ └── file_tPccB_2.txt\n │ │ │ │ ├── file_BuOSw_1.txt\n │ │ │ │ └── file_TpoqE_0.txt\n │ │ │ ├── folder_wTvun_3_4/\n │ │ │ │ ├── file_GyhyE_1.txt\n │ │ │ │ ├── file_POsla_2.txt\n │ │ │ │ └── file_tSsvk_0.txt\n │ │ │ ├── file_irNju_0.txt\n │ │ │ └── file_jYBRm_1.txt\n │ │ ├── folder_YlJLI_2_0/\n │ │ │ └── file_FpFSL_0.txt\n │ │ ├── file_cFgBr_2.txt\n │ │ ├── file_lKEWN_1.txt\n │ │ └── file_ZEWFP_0.txt\n │ └── file_ayUCH_0.txt\n ├── folder_xtgyi_0_0/\n │ └── file_BvSOB_0.txt\n ├── mixed_content/\n │ └── images_and_text/\n │ └── notes.txt\n ├── project/\n │ ├── docs/\n │ │ └── archive/\n │ │ └── 2023/\n │ │ └── reports/\n │ │ ├── report_0.txt\n │ │ ├── 
report_1.txt\n │ │ └── report_2.txt\n │ └── src/\n │ └── main/\n │ └── resources/\n └── m.py",
"stateUrl": "https://storage.mcpmark.ai/filesystem/folder_structure.zip",
"stateOriginalUrl": null
}
}
================================================
FILE: tasks/filesystem/standard/folder_structure/structure_analysis/verify.py
================================================
#!/usr/bin/env python3
"""
Verification script for Directory Structure Analysis Task
"""
import sys
from pathlib import Path
import os
import re
def get_test_directory() -> Path:
"""Get the test directory from FILESYSTEM_TEST_DIR env var."""
test_root = os.environ.get("FILESYSTEM_TEST_DIR")
if not test_root:
raise ValueError("FILESYSTEM_TEST_DIR environment variable is required")
return Path(test_root)
def verify_structure_analysis_file_exists(test_dir: Path) -> bool:
"""Verify that the structure_analysis.txt file exists."""
analysis_file = test_dir / "structure_analysis.txt"
if not analysis_file.exists():
print("❌ File 'structure_analysis.txt' not found")
return False
print("✅ structure_analysis.txt file found")
return True
def verify_structure_analysis_file_readable(test_dir: Path) -> bool:
"""Verify that the structure_analysis.txt file is readable."""
analysis_file = test_dir / "structure_analysis.txt"
try:
content = analysis_file.read_text()
if not content.strip():
print("❌ structure_analysis.txt file is empty")
return False
print("✅ structure_analysis.txt file is readable")
return True
except Exception as e:
print(f"❌ Error reading structure_analysis.txt file: {e}")
return False
def verify_subtask1_file_statistics(test_dir: Path) -> bool:
"""Verify subtask 1: File Statistics - files must be 69, folders must be 51, 58097 allows +-1000."""
analysis_file = test_dir / "structure_analysis.txt"
try:
content = analysis_file.read_text()
# Extract numbers from the content
file_count_match = re.search(r'total number of files:\s*(\d+)', content)
folder_count_match = re.search(r'total number of folders:\s*(\d+)', content)
size_match = re.search(r'total size of all files:\s*(\d+)', content)
if not file_count_match or not folder_count_match or not size_match:
print("❌ Could not extract file statistics from structure_analysis.txt")
return False
file_count = int(file_count_match.group(1))
folder_count = int(folder_count_match.group(1))
total_size = int(size_match.group(1))
print(f"📊 Found: files={file_count}, folders={folder_count}, size={total_size}")
# Check if file count is exactly 69
if file_count != 69:
print(f"❌ File count must be 69, found: {file_count}")
return False
# Check if folder count is exactly 51
if folder_count != 51:
print(f"❌ Folder count must be 51, found: {folder_count}")
return False
# Check if size is within acceptable range (58097 ± 1000)
expected_size = 58097
size_tolerance = 1000
if abs(total_size - expected_size) > size_tolerance:
print(f"❌ Total size ({total_size}) is not within acceptable range ({expected_size} ± {size_tolerance})")
return False
print(f"✅ File statistics verified: files={file_count}, folders={folder_count}, size={total_size} (within tolerance)")
return True
except Exception as e:
print(f"❌ Error verifying file statistics: {e}")
return False
def verify_subtask2_depth_analysis(test_dir: Path) -> bool:
"""Verify subtask 2: Depth Analysis - depth must be 7, verify path exists."""
analysis_file = test_dir / "structure_analysis.txt"
try:
content = analysis_file.read_text()
# Extract depth and path
depth_match = re.search(r'depth:\s*(\d+)', content)
if not depth_match:
print("❌ Could not extract depth from structure_analysis.txt")
return False
depth = int(depth_match.group(1))
# Check if depth is exactly 7
if depth != 7:
print(f"❌ Depth must be 7, found: {depth}")
return False
print(f"✅ Depth verified: {depth}")
# Extract the path (it should be on a separate line after "depth: 7")
lines = content.split('\n')
path_line = None
for i, line in enumerate(lines):
if line.strip() == f"depth: {depth}":
if i + 1 < len(lines):
path_line = lines[i + 1].strip()
break
if not path_line:
print("❌ Could not find path line after depth specification")
return False
print(f"📁 Found path: {path_line}")
# Verify that the path depth matches the declared depth
path_parts = path_line.split('/')
actual_depth = len(path_parts)
if actual_depth != depth:
print(f"❌ Path depth mismatch: declared depth is {depth}, but path has {actual_depth} levels")
print(f" Path: {path_line}")
print(f" Path parts: {path_parts}")
return False
print(f"✅ Path depth verified: {actual_depth} levels")
# Verify that this path exists in the test environment
expected_path = test_dir / path_line
if not expected_path.exists():
print(f"❌ Path does not exist: {expected_path}")
return False
if not expected_path.is_dir():
print(f"❌ Path exists but is not a directory: {expected_path}")
return False
print(f"✅ Path verified and exists: {path_line}")
return True
except Exception as e:
print(f"❌ Error verifying depth analysis: {e}")
return False
def verify_subtask3_file_type_classification(test_dir: Path) -> bool:
"""Verify subtask 3: File Type Classification - 68 and 1 must be accurate."""
analysis_file = test_dir / "structure_analysis.txt"
try:
content = analysis_file.read_text()
# Extract file type counts
txt_match = re.search(r'txt:\s*(\d+)', content)
py_match = re.search(r'py:\s*(\d+)', content)
if not txt_match or not py_match:
print("❌ Could not extract file type counts from structure_analysis.txt")
return False
txt_count = int(txt_match.group(1))
py_count = int(py_match.group(1))
print(f"📁 Found: txt={txt_count}, py={py_count}")
# Check if txt count is exactly 68
if txt_count != 68:
print(f"❌ txt count must be 68, found: {txt_count}")
return False
# Check if py count is exactly 1
if py_count != 1:
print(f"❌ py count must be 1, found: {py_count}")
return False
print(f"✅ File type classification verified: txt={txt_count}, py={py_count}")
return True
except Exception as e:
print(f"❌ Error verifying file type classification: {e}")
return False
def verify_file_format(test_dir: Path) -> bool:
"""Verify that the structure_analysis.txt file has proper format."""
analysis_file = test_dir / "structure_analysis.txt"
try:
content = analysis_file.read_text()
lines = content.split('\n')
# Check if file has the expected structure
if len(lines) < 5: # Should have at least 5 lines
print("❌ File seems too short to contain all required information")
return False
# Basic format check - ensure it's not completely corrupted
if not content.strip():
print("❌ File is completely empty")
return False
print("✅ File format is acceptable")
return True
except Exception as e:
print(f"❌ Error checking file format: {e}")
return False
def main():
"""Main verification function."""
try:
test_dir = get_test_directory()
print(f"🔍 Verifying Directory Structure Analysis Task in: {test_dir}")
# Define verification steps
verification_steps = [
("Structure Analysis File Exists", verify_structure_analysis_file_exists),
("File is Readable", verify_structure_analysis_file_readable),
("Subtask 1: File Statistics", verify_subtask1_file_statistics),
("Subtask 2: Depth Analysis", verify_subtask2_depth_analysis),
("Subtask 3: File Type Classification", verify_subtask3_file_type_classification),
("File Format", verify_file_format),
]
# Run all verification steps
all_passed = True
for step_name, verify_func in verification_steps:
print(f"\n--- {step_name} ---")
if not verify_func(test_dir):
all_passed = False
# Final result
print("\n" + "="*50)
if all_passed:
print("✅ Directory Structure Analysis completed correctly!")
print("🎉 Structure Analysis verification: PASS")
sys.exit(0)
else:
print("❌ Structure Analysis verification: FAIL")
sys.exit(1)
except Exception as e:
print(f"❌ Verification failed with error: {e}")
sys.exit(1)
if __name__ == "__main__":
main()
================================================
FILE: tasks/filesystem/standard/folder_structure/structure_mirror/description.md
================================================
Please use FileSystem tools to finish the following task:
### Task
Copy the entire directory structure of `complex_structure/` to `complex_structure_mirror/` without copying any file contents. Do not use python code.
### Requirements
- Create the entire directory structure in `complex_structure_mirror/`
- Do not copy any file contents, only create directories
- In each empty directory, create a `placeholder.txt` file containing the absolute path of that directory
- Handle nested directories of any depth
- You should also follow 2 rules:
1. **Discard any directory that directly contains more than 2 files (count only files directly inside that folder).**
2. **If a directory name contains numbers, append "_processed" to the mirror directory name**
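The task itself must not be solved with Python code; the sketch below only illustrates how the two rules and the placeholder requirement interact (it would be invoked as `mirror_structure(main_dir / "complex_structure", main_dir / "complex_structure_mirror")`, where `main_dir` is a placeholder):

```python
import re
from pathlib import Path

def mirror_structure(source: Path, mirror: Path) -> None:
    """Recreate the directory tree of `source` under `mirror`, applying both rules."""
    direct_files = sum(1 for p in source.iterdir() if p.is_file() and p.name != ".DS_Store")
    if direct_files > 2:
        return  # Rule 1: discard directories that directly contain more than 2 files
    mirror.mkdir(parents=True, exist_ok=True)
    for sub in (p for p in source.iterdir() if p.is_dir()):
        # Rule 2: append "_processed" when the directory name contains a digit
        name = (sub.name + "_processed") if re.search(r"\d", sub.name) else sub.name
        mirror_structure(sub, mirror / name)
    if not any(mirror.iterdir()):
        # Empty mirror directories get a placeholder.txt containing their absolute path
        (mirror / "placeholder.txt").write_text(str(mirror.resolve()) + "\n")
```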
================================================
FILE: tasks/filesystem/standard/folder_structure/structure_mirror/meta.json
================================================
{
"task_id": "structure_mirror",
"task_name": "Structure Mirror",
"category_id": "folder_structure",
"category_name": "Folder Structure",
"description": "Create an exact mirror copy of the folder structure in a target location while applying specified transformation rules.",
"author": "Lingjun Chen",
"created_at": "2025-08-08",
"difficulty": "L3",
"tags": [
"file organization",
"content transformation"
],
"mcp": [
"filesystem"
],
"meta_data": {
"stateType": "text",
"stateContent": "folder_structure/\n └── complex_structure/\n ├── deeply/\n │ └── nested/\n │ └── folder/\n │ └── structure/\n ├── empty_folder/\n ├── folder_lxkHt_0_1/\n │ └── file_PeLzC_0.txt\n ├── folder_QdTAj_0_2/\n │ ├── folder_eXccj_1_0/\n │ │ ├── folder_Mqlwh_2_1/\n │ │ │ ├── folder_cKxcP_3_3/\n │ │ │ │ ├── folder_BPTMK_4_1/\n │ │ │ │ │ └── file_RHtBP_0.txt\n │ │ │ │ ├── folder_QNqjq_4_0/\n │ │ │ │ │ ├── folder_gRwPE_5_1/\n │ │ │ │ │ │ ├── file_jVlpp_0.txt\n │ │ │ │ │ │ └── file_vJuHz_1.txt\n │ │ │ │ │ ├── folder_XdXYJ_5_0/\n │ │ │ │ │ │ └── file_KvkKi_0.txt\n │ │ │ │ │ ├── file_gGxLG_2.txt\n │ │ │ │ │ ├── file_Hzkxo_0.txt\n │ │ │ │ │ └── file_XRjeh_1.txt\n │ │ │ │ ├── folder_vIBIt_4_2/\n │ │ │ │ │ ├── folder_kRDNS_5_0/\n │ │ │ │ │ │ └── file_wFSjJ_0.txt\n │ │ │ │ │ └── file_NyBSO_0.txt\n │ │ │ │ ├── file_EOCNf_1.txt\n │ │ │ │ └── file_gmrXA_0.txt\n │ │ │ ├── folder_NcruA_3_1/\n │ │ │ │ ├── file_bLWDj_1.txt\n │ │ │ │ └── file_WAftR_0.txt\n │ │ │ ├── folder_qCDFI_3_2/\n │ │ │ │ ├── file_eSMOJ_0.txt\n │ │ │ │ ├── file_oxADy_2.txt\n │ │ │ │ └── file_RTbbc_1.txt\n │ │ │ ├── folder_QVHUU_3_0/\n │ │ │ │ ├── folder_FEPTK_4_1/\n │ │ │ │ │ ├── folder_GHoMC_5_1/\n │ │ │ │ │ │ └── file_rAMYd_0.txt\n │ │ │ │ │ ├── folder_iBDUY_5_0/\n │ │ │ │ │ │ └── file_IJCaw_0.txt\n │ │ │ │ │ ├── folder_VRXgp_5_2/\n │ │ │ │ │ │ └── file_hkUmS_0.txt\n │ │ │ │ │ ├── file_nqLAf_1.txt\n │ │ │ │ │ └── file_XflmA_0.txt\n │ │ │ │ ├── folder_FlPoK_4_3/\n │ │ │ │ │ ├── folder_hSVNm_5_3/\n │ │ │ │ │ │ └── file_klnbn_0.txt\n │ │ │ │ │ ├── folder_iZuEl_5_0/\n │ │ │ │ │ │ └── file_LqAmy_0.txt\n │ │ │ │ │ ├── folder_LcURj_5_2/\n │ │ │ │ │ │ ├── file_RgwOS_1.txt\n │ │ │ │ │ │ └── file_ZHnYb_0.txt\n │ │ │ │ │ ├── folder_tuZQJ_5_1/\n │ │ │ │ │ │ └── file_LHuIx_0.txt\n │ │ │ │ │ ├── file_asJnB_1.txt\n │ │ │ │ │ └── file_EzLdu_0.txt\n │ │ │ │ ├── folder_ndhsJ_4_0/\n │ │ │ │ │ ├── folder_CUSXK_5_0/\n │ │ │ │ │ │ ├── file_DpiuM_1.txt\n │ │ │ │ │ │ └── file_pSqeG_0.txt\n │ │ │ │ │ ├── folder_pstmE_5_1/\n │ │ │ │ │ │ └── file_YwdJt_0.txt\n │ │ │ │ │ ├── folder_StlsP_5_2/\n │ │ │ │ │ │ ├── file_kriBJ_0.txt\n │ │ │ │ │ │ └── file_XCEdm_1.txt\n │ │ │ │ │ ├── file_ToDjh_1.txt\n │ │ │ │ │ └── file_xbIVx_0.txt\n │ │ │ │ ├── folder_PJBok_4_4/\n │ │ │ │ │ ├── folder_mzxaf_5_0/\n │ │ │ │ │ │ ├── file_ILBzj_2.txt\n │ │ │ │ │ │ ├── file_MTGMm_1.txt\n │ │ │ │ │ │ └── file_zBDqz_0.txt\n │ │ │ │ │ ├── folder_sULMj_5_1/\n │ │ │ │ │ │ ├── file_BHziw_1.txt\n │ │ │ │ │ │ ├── file_sIjiu_2.txt\n │ │ │ │ │ │ └── file_VqNkB_0.txt\n │ │ │ │ │ ├── folder_vypSi_5_3/\n │ │ │ │ │ │ ├── file_kZbIm_1.txt\n │ │ │ │ │ │ └── file_sOBtE_0.txt\n │ │ │ │ │ ├── folder_ZLGHy_5_2/\n │ │ │ │ │ │ ├── file_azaFF_0.txt\n │ │ │ │ │ │ └── file_nAFRe_1.txt\n │ │ │ │ │ ├── file_mIkQU_0.txt\n │ │ │ │ │ └── file_sGPxd_1.txt\n │ │ │ │ ├── folder_VTbEG_4_2/\n │ │ │ │ │ ├── file_HtYLg_0.txt\n │ │ │ │ │ ├── file_JXjMd_1.txt\n │ │ │ │ │ └── file_tPccB_2.txt\n │ │ │ │ ├── file_BuOSw_1.txt\n │ │ │ │ └── file_TpoqE_0.txt\n │ │ │ ├── folder_wTvun_3_4/\n │ │ │ │ ├── file_GyhyE_1.txt\n │ │ │ │ ├── file_POsla_2.txt\n │ │ │ │ └── file_tSsvk_0.txt\n │ │ │ ├── file_irNju_0.txt\n │ │ │ └── file_jYBRm_1.txt\n │ │ ├── folder_YlJLI_2_0/\n │ │ │ └── file_FpFSL_0.txt\n │ │ ├── file_cFgBr_2.txt\n │ │ ├── file_lKEWN_1.txt\n │ │ └── file_ZEWFP_0.txt\n │ └── file_ayUCH_0.txt\n ├── folder_xtgyi_0_0/\n │ └── file_BvSOB_0.txt\n ├── mixed_content/\n │ └── images_and_text/\n │ └── notes.txt\n ├── project/\n │ ├── docs/\n │ │ └── archive/\n │ │ └── 2023/\n │ │ └── reports/\n │ │ ├── report_0.txt\n │ │ ├── 
report_1.txt\n │ │ └── report_2.txt\n │ └── src/\n │ └── main/\n │ └── resources/\n └── m.py",
"stateUrl": "https://storage.mcpmark.ai/filesystem/folder_structure.zip",
"stateOriginalUrl": null
}
}
================================================
FILE: tasks/filesystem/standard/folder_structure/structure_mirror/verify.py
================================================
#!/usr/bin/env python3
"""
Verification script for Directory Structure Mirroring with Smart Placeholders Task
"""
import sys
from pathlib import Path
import os
def get_test_directory() -> Path:
"""Get the test directory from FILESYSTEM_TEST_DIR env var."""
test_root = os.environ.get("FILESYSTEM_TEST_DIR")
if not test_root:
raise ValueError("FILESYSTEM_TEST_DIR environment variable is required")
return Path(test_root)
def verify_mirror_directory_exists(test_dir: Path, mirror_path: Path) -> bool:
"""Verify that a mirror directory exists."""
if not mirror_path.exists():
print(f"❌ Mirror directory not found: {mirror_path.relative_to(test_dir)}")
return False
if not mirror_path.is_dir():
print(f"❌ Mirror path exists but is not a directory: {mirror_path.relative_to(test_dir)}")
return False
print(f"✅ Mirror directory exists: {mirror_path.relative_to(test_dir)}")
return True
def verify_placeholder_file_exists(mirror_path: Path, test_dir: Path) -> bool:
"""Verify that placeholder.txt exists in the mirror directory."""
placeholder_file = mirror_path / "placeholder.txt"
if not placeholder_file.exists():
print(f"❌ placeholder.txt not found in: {mirror_path.relative_to(test_dir)}")
return False
if not placeholder_file.is_file():
print(f"❌ placeholder.txt exists but is not a file in: {mirror_path.relative_to(test_dir)}")
return False
print(f"✅ placeholder.txt exists in: {mirror_path.relative_to(test_dir)}")
return True
def verify_placeholder_content(mirror_path: Path, test_dir: Path) -> bool:
"""Verify that placeholder.txt contains the correct path ending with complex_structure_mirror/..."""
placeholder_file = mirror_path / "placeholder.txt"
try:
content = placeholder_file.read_text().strip()
# Check if content is not empty
if not content:
print(f"❌ placeholder.txt is empty in: {mirror_path.relative_to(test_dir)}")
return False
# Check if it contains the correct path ending with complex_structure_mirror/...
expected_ending = f"complex_structure_mirror/{mirror_path.relative_to(test_dir / 'complex_structure_mirror')}"
if not content.endswith(expected_ending):
print(f"❌ placeholder.txt content incorrect in: {mirror_path.relative_to(test_dir)}")
print(f" Expected ending: {expected_ending}")
print(f" Found: {content}")
return False
print(f"✅ placeholder.txt content is correct in: {mirror_path.relative_to(test_dir)}")
return True
except Exception as e:
print(f"❌ Error reading placeholder.txt in {mirror_path.relative_to(test_dir)}: {e}")
return False
def verify_no_files_copied(test_dir: Path) -> bool:
"""Verify that no file contents were copied, only directory structure."""
source_dir = test_dir / "complex_structure"
mirror_dir = test_dir / "complex_structure_mirror"
if not mirror_dir.exists():
print("❌ Mirror directory 'complex_structure_mirror' not found")
return False
# Check that no files from source were copied (except placeholder.txt files)
for source_file in source_dir.rglob("*"):
if source_file.is_file():
# Calculate the corresponding mirror path
relative_path = source_file.relative_to(source_dir)
mirror_file = mirror_dir / relative_path
# Skip if this would be a placeholder.txt file
if mirror_file.name == "placeholder.txt":
continue
if mirror_file.exists():
print(f"❌ File was copied when it shouldn't be: {relative_path}")
return False
print("✅ No file contents were copied, only directory structure")
return True
def verify_mirror_structure_completeness(test_dir: Path) -> bool:
"""Verify that the mirror structure is complete and matches expected structure."""
mirror_dir = test_dir / "complex_structure_mirror"
if not mirror_dir.exists():
print("❌ Mirror directory 'complex_structure_mirror' not found")
return False
# Define expected directories that should exist (based on backup structure)
expected_dirs = [
"deeply",
"deeply/nested",
"deeply/nested/folder",
"deeply/nested/folder/structure",
"empty_folder",
"folder_lxkHt_0_1_processed",
"folder_QdTAj_0_2_processed",
"folder_xtgyi_0_0_processed",
"mixed_content",
"mixed_content/images_and_text",
"project",
"project/docs",
"project/docs/archive",
"project/docs/archive/2023_processed",
"project/src",
"project/src/main",
"project/src/main/resources"
]
# Define which directories should have placeholder.txt files
placeholder_dirs = [
"deeply/nested/folder/structure",
"empty_folder",
"folder_lxkHt_0_1_processed",
"folder_QdTAj_0_2_processed",
"folder_xtgyi_0_0_processed",
"mixed_content/images_and_text",
"project/docs/archive/2023_processed",
"project/src/main/resources"
]
all_passed = True
# Check that all expected directories exist
for expected_dir in expected_dirs:
mirror_path = mirror_dir / expected_dir
if not verify_mirror_directory_exists(test_dir, mirror_path):
all_passed = False
elif expected_dir in placeholder_dirs:
# Check placeholder.txt for directories that should have it
if not verify_placeholder_file_exists(mirror_path, test_dir):
all_passed = False
elif not verify_placeholder_content(mirror_path, test_dir):
all_passed = False
# Check that no unexpected directories exist
for mirror_subdir in mirror_dir.rglob("*"):
if mirror_subdir.is_dir():
relative_path = mirror_subdir.relative_to(mirror_dir)
if str(relative_path) not in expected_dirs and str(relative_path) != ".":
print(f"❌ Unexpected directory found: {relative_path}")
all_passed = False
return all_passed
def main():
"""Main verification function."""
try:
test_dir = get_test_directory()
print(f"🔍 Verifying Directory Structure Mirroring with Smart Placeholders in: {test_dir}")
# Define verification steps
verification_steps = [
("No files copied", verify_no_files_copied),
("Mirror structure completeness", verify_mirror_structure_completeness),
]
# Run all verification steps
all_passed = True
for step_name, verify_func in verification_steps:
print(f"\n📋 Checking: {step_name}")
if not verify_func(test_dir):
all_passed = False
# Final result
print("\n" + "="*50)
if all_passed:
print("✅ Directory structure mirroring completed correctly!")
print("🎉 Structure Mirror verification: PASS")
sys.exit(0)
else:
print("❌ Structure Mirror verification: FAIL")
sys.exit(1)
except Exception as e:
print(f"❌ Verification failed with error: {e}")
sys.exit(1)
if __name__ == "__main__":
main()
================================================
FILE: tasks/filesystem/standard/legal_document/dispute_review/description.md
================================================
Please use FileSystem tools to finish the following task:
**Overview**
The folder "legal_files/" contains all versions (Preferred_Stock_Purchase_Agreement_v0.txt -- Preferred_Stock_Purchase_Agreement_v10.txt) of the Stock Purchase Agreement for a corporate investment project.
It contains comments from four people:
- **Bill Harvey** (Company CEO)
- **Michelle Jackson** (Investor)
- **David Russel** (Company Counsel)
- **Tony Taylor** (Investor Counsel)
Between v1 and v9, these four people made comments on the clauses. The comment format is `[name:content]`, where:
- `name` is the commenter's name
- `content` is the revision note
**Special Note:** If the name is "All parties", it represents a joint comment from all parties, which counts as one comment but does not count toward any individual's personal comment count.
## Task
Your task is to review these versions and identify all clauses that have been commented on in **v5, v6, and v7 (in folder legal_files/)**. Generate a file named `dispute_review.txt` in the main directory. In this file, list each commented clause on a separate line and give the number of comments for each clause in the format "Clause number:number of comments". The clause number should be in the format X.X.
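As an illustration of the counting idea only, a minimal sketch is shown below; it assumes clause numbers appear as `X.X` at the start of a line, which is an assumption about the file layout rather than something stated in the task:

```python
import re
from collections import Counter
from pathlib import Path

CLAUSE_RE = re.compile(r"^(\d+\.\d+)\b")          # assumed: clause headings start a line as "X.X"
COMMENT_RE = re.compile(r"\[([^:\]]+):[^\]]*\]")  # [name:content] comment markers

def count_clause_comments(legal_dir: Path, versions=(5, 6, 7)) -> Counter:
    """Count comment markers per clause across the given agreement versions."""
    counts = Counter()
    for v in versions:
        text = (legal_dir / f"Preferred_Stock_Purchase_Agreement_v{v}.txt").read_text()
        clause = None
        for line in text.splitlines():
            heading = CLAUSE_RE.match(line)
            if heading:
                clause = heading.group(1)
            if clause:
                counts[clause] += len(COMMENT_RE.findall(line))
    return counts
```

Each non-zero entry would then be written to `dispute_review.txt` as one `clause:count` line.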
================================================
FILE: tasks/filesystem/standard/legal_document/dispute_review/meta.json
================================================
{
"task_id": "dispute_review",
"task_name": "Dispute Review",
"category_id": "legal_document",
"category_name": "Legal Document",
"description": "Analyze multiple versions of legal documents to track clause discussion frequency and generate a comprehensive dispute summary report.",
"author": "Lingjun Chen",
"created_at": "2025-08-15",
"difficulty": "L3",
"tags": [
"data extraction",
"cross-referencing",
"pattern analysis"
],
"mcp": [
"filesystem"
],
"meta_data": {
"stateType": "text",
"stateContent": "legal_document/\n └── legal_files/\n ├── Preferred_Stock_Purchase_Agreement_v0.txt\n ├── Preferred_Stock_Purchase_Agreement_v1.txt\n ├── Preferred_Stock_Purchase_Agreement_v2.txt\n ├── Preferred_Stock_Purchase_Agreement_v3.txt\n ├── Preferred_Stock_Purchase_Agreement_v4.txt\n ├── Preferred_Stock_Purchase_Agreement_v5.txt\n ├── Preferred_Stock_Purchase_Agreement_v6.txt\n ├── Preferred_Stock_Purchase_Agreement_v7.txt\n ├── Preferred_Stock_Purchase_Agreement_v8.txt\n ├── Preferred_Stock_Purchase_Agreement_v9.txt\n └── Preferred_Stock_Purchase_Agreement_v10.txt",
"stateUrl": "https://storage.mcpmark.ai/filesystem/legal_document.zip",
"stateOriginalUrl": "https://www.cooleygo.com/documents/nvca-financing-documents"
}
}
================================================
FILE: tasks/filesystem/standard/legal_document/dispute_review/verify.py
================================================
#!/usr/bin/env python3
"""
Verification script for Legal Document Dispute Review Task
"""
import sys
from pathlib import Path
import re
import os
def get_test_directory() -> Path:
"""Get the test directory from FILESYSTEM_TEST_DIR env var."""
test_root = os.environ.get("FILESYSTEM_TEST_DIR")
if not test_root:
raise ValueError("FILESYSTEM_TEST_DIR environment variable is required")
return Path(test_root)
def verify_output_file_exists(test_dir: Path) -> bool:
"""Verify that the dispute_review.txt file exists."""
output_file = test_dir / "dispute_review.txt"
if not output_file.exists():
print("❌ File 'dispute_review.txt' not found")
return False
print("✅ Output file found")
return True
def verify_output_format(test_dir: Path) -> bool:
"""Verify that the output file has the correct format."""
output_file = test_dir / "dispute_review.txt"
try:
content = output_file.read_text().strip()
# Check if content is not empty
if not content:
print("❌ Output file is empty")
return False
# Check format: each line should be "X.X:number"
lines = content.split('\n')
for i, line in enumerate(lines, 1):
line = line.strip()
if not line:
continue
# Check format: X.X:number
if not re.match(r'^\d+\.\d+:\d+$', line):
print(f"❌ Line {i} has incorrect format: '{line}'")
print(" Expected format: 'X.X:number' (e.g., '1.1:3')")
return False
print("✅ Output format is correct")
return True
except Exception as e:
print(f"❌ Error reading output file: {e}")
return False
def verify_expected_entries(test_dir: Path) -> bool:
"""Verify that the output contains the expected entries with correct counts."""
output_file = test_dir / "dispute_review.txt"
try:
content = output_file.read_text().strip()
lines = content.split('\n')
# Parse the output into a dictionary
output_entries = {}
for line in lines:
line = line.strip()
if not line:
continue
clause, count_str = line.split(':', 1)
output_entries[clause] = int(count_str)
# Expected entries based on answer.txt
expected_entries = {
"1.1": 3,
"1.3": 3,
"4.6": [5, 6], # Can be either 5 or 6
"4.16": 5,
"6.8": 4
}
# Check if all expected entries are present
missing_entries = []
for clause in expected_entries:
if clause not in output_entries:
missing_entries.append(clause)
if missing_entries:
print(f"❌ Missing expected entries: {missing_entries}")
return False
# Check if there are extra entries
extra_entries = []
for clause in output_entries:
if clause not in expected_entries:
extra_entries.append(clause)
if extra_entries:
print(f"❌ Unexpected extra entries: {extra_entries}")
return False
# Check counts for each entry
for clause, expected_count in expected_entries.items():
actual_count = output_entries[clause]
if isinstance(expected_count, list):
# For 4.6, accept either 5 or 6
if actual_count not in expected_count:
print(f"❌ Clause {clause}: expected {expected_count}, got {actual_count}")
return False
else:
if actual_count != expected_count:
print(f"❌ Clause {clause}: expected {expected_count}, got {actual_count}")
return False
print("✅ All expected entries with correct counts")
return True
except Exception as e:
print(f"❌ Error verifying entries: {e}")
return False
def verify_comment_count_accuracy(test_dir: Path) -> bool:
"""Verify that the comment counts are accurate by checking the actual files."""
# Since we already verify the expected entries in verify_expected_entries,
# and the answer.txt contains the correct counts, we can skip this complex verification
# to avoid false negatives due to regex matching issues.
print("✅ Comment count accuracy check skipped - relying on expected entries verification")
return True
def main():
"""Main verification function."""
test_dir = get_test_directory()
print("🔍 Verifying Legal Document Dispute Review Task...")
# Define verification steps
verification_steps = [
("Output File Exists", verify_output_file_exists),
("Output Format", verify_output_format),
("Expected Entries", verify_expected_entries),
("Comment Count Accuracy", verify_comment_count_accuracy),
]
# Run all verification steps
all_passed = True
for step_name, verify_func in verification_steps:
print(f"\n--- {step_name} ---")
if not verify_func(test_dir):
all_passed = False
# Final result
print("\n" + "="*50)
if all_passed:
print("✅ Legal document dispute review completed correctly!")
print("🎉 Task verification: PASS")
sys.exit(0)
else:
print("❌ Task verification: FAIL")
sys.exit(1)
if __name__ == "__main__":
main()
================================================
FILE: tasks/filesystem/standard/legal_document/individual_comments/description.md
================================================
Please use FileSystem tools to finish the following task:
**Overview**
The folder "legal_files/" contains all versions (Preferred_Stock_Purchase_Agreement_v0.txt -- Preferred_Stock_Purchase_Agreement_v10.txt) of the Stock Purchase Agreement for a corporate investment project.
These files contain comments from four people:
- **Bill Harvey** (Company CEO)
- **Michelle Jackson** (Investor)
- **David Russel** (Company Counsel)
- **Tony Taylor** (Investor Counsel)
Between v1 and v9, these four people made comments on the clauses. The comment format is `[name:content]`, where:
- `name` is the commenter's name
- `content` is the revision note
**Special Note:** If the name is "All parties", it represents a joint comment from all parties, which counts as one comment but does not count toward any individual's personal comment count.
## Task
Your task is to count the number of comments made by Bill Harvey (Company CEO), Michelle Jackson (Investor), David Russel (Company Counsel), and Tony Taylor (Investor Counsel) in clauses 1.1, 1.3, 4.6, 4.16, 6.8, and 6.16 **in versions 5-8.** Please generate `individual_comment.csv` in the **main directory**, where the first row contains these clauses (1.1, 1.3, 4.6, 4.16, 6.8, 6.16) and the first column contains the four names (Bill Harvey, Michelle Jackson, David Russel, Tony Taylor). Fill in the table with the number of comments for each person and each clause. If there are no comments, write 0.
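The counting itself is straightforward to script once the comment pattern is fixed. Below is a minimal sketch, assuming comments follow the `[name:content]` format above and that each clause's text can be sliced out by its section number; the file paths and the clause-boundary regex are illustrative, not part of the task.

```python
import re
from collections import defaultdict
from pathlib import Path

PEOPLE = ["Bill Harvey", "Michelle Jackson", "David Russel", "Tony Taylor"]
CLAUSES = ["1.1", "1.3", "4.6", "4.16", "6.8", "6.16"]
COMMENT_RE = re.compile(r"\[([^:\]]+):([^\]]*)\]")  # [name:content]

counts = defaultdict(int)  # (person, clause) -> number of comments
for version in range(5, 9):  # versions 5-8
    text = Path(f"legal_files/Preferred_Stock_Purchase_Agreement_v{version}.txt").read_text()
    for clause in CLAUSES:
        # Illustrative clause slicing: from the clause heading up to the next "X.Y" heading.
        block = re.search(rf"(?ms)^{re.escape(clause)}\s.*?(?=^\d+\.\d+\s|\Z)", text)
        if not block:
            continue
        for name, _content in COMMENT_RE.findall(block.group(0)):
            if name.strip() in PEOPLE:  # "All parties" is not counted for any individual
                counts[(name.strip(), clause)] += 1
```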
================================================
FILE: tasks/filesystem/standard/legal_document/individual_comments/meta.json
================================================
{
"task_id": "individual_comments",
"task_name": "Individual Comments",
"category_id": "legal_document",
"category_name": "Legal Document",
"description": "Extract and analyze individual reviewer comments on legal clauses across multiple document versions to understand personal perspectives.",
"author": "Lingjun Chen",
"created_at": "2025-08-15",
"difficulty": "L3",
"tags": [
"data extraction",
"cross-referencing",
"pattern analysis"
],
"mcp": [
"filesystem"
],
"meta_data": {
"stateType": "text",
"stateContent": "legal_document/\n └── legal_files/\n ├── Preferred_Stock_Purchase_Agreement_v0.txt\n ├── Preferred_Stock_Purchase_Agreement_v1.txt\n ├── Preferred_Stock_Purchase_Agreement_v2.txt\n ├── Preferred_Stock_Purchase_Agreement_v3.txt\n ├── Preferred_Stock_Purchase_Agreement_v4.txt\n ├── Preferred_Stock_Purchase_Agreement_v5.txt\n ├── Preferred_Stock_Purchase_Agreement_v6.txt\n ├── Preferred_Stock_Purchase_Agreement_v7.txt\n ├── Preferred_Stock_Purchase_Agreement_v8.txt\n ├── Preferred_Stock_Purchase_Agreement_v9.txt\n └── Preferred_Stock_Purchase_Agreement_v10.txt",
"stateUrl": "https://storage.mcpmark.ai/filesystem/legal_document.zip",
"stateOriginalUrl": "https://www.cooleygo.com/documents/nvca-financing-documents"
}
}
================================================
FILE: tasks/filesystem/standard/legal_document/individual_comments/verify.py
================================================
#!/usr/bin/env python3
"""
Verification script for Legal Document Individual Comments Task
"""
import sys
from pathlib import Path
import csv
import os
def get_test_directory() -> Path:
"""Get the test directory from FILESYSTEM_TEST_DIR env var."""
test_root = os.environ.get("FILESYSTEM_TEST_DIR")
if not test_root:
raise ValueError("FILESYSTEM_TEST_DIR environment variable is required")
return Path(test_root)
def verify_output_file_exists(test_dir: Path) -> bool:
"""Verify that the individual_comment.csv file exists."""
output_file = test_dir / "individual_comment.csv"
if not output_file.exists():
print("❌ File 'individual_comment.csv' not found")
return False
print("✅ Output file 'individual_comment.csv' found")
return True
def verify_csv_format(test_dir: Path) -> bool:
"""Verify that the CSV file has the correct format."""
output_file = test_dir / "individual_comment.csv"
try:
with open(output_file, 'r', newline='', encoding='utf-8') as csvfile:
reader = csv.reader(csvfile)
rows = list(reader)
if not rows:
print("❌ CSV file is empty")
return False
# Check if there are at least 2 rows (header + data)
if len(rows) < 2:
print("❌ CSV file has insufficient rows")
return False
# Check if header row has correct number of columns
header = rows[0]
if len(header) != 7: # First column (can be anything) + 6 clauses
print(f"❌ Header row has incorrect number of columns: {len(header)}, expected 7")
return False
# Check if data rows have correct number of columns
for i, row in enumerate(rows[1:], 1):
if len(row) != 7:
print(f"❌ Data row {i} has incorrect number of columns: {len(row)}, expected 7")
return False
print("✅ CSV format is correct")
return True
except Exception as e:
print(f"❌ Error reading CSV file: {e}")
return False
def verify_csv_content(test_dir: Path) -> bool:
"""Verify that the CSV content matches the expected answer exactly."""
output_file = test_dir / "individual_comment.csv"
try:
with open(output_file, 'r', newline='', encoding='utf-8') as csvfile:
reader = csv.reader(csvfile)
rows = list(reader)
# Expected data based on answer.csv
expected_data = {
"Bill Harvey": ["0", "2", "3", "1", "1", "1"],
"Michelle Jackson": ["0", "1", "2", "1", "1", "1"],
"David Russel": ["2", "1", "1", "2", "1", "1"],
"Tony Taylor": ["2", "0", "1", "2", "1", "1"]
}
# Expected header columns (excluding first column which can be anything)
expected_header_columns = ["1.1", "1.3", "4.6", "4.16", "6.8", "6.16"]
# Verify header has correct number of columns
header = rows[0]
if len(header) != 7: # First column + 6 clauses
print(f"❌ Header row has incorrect number of columns: {len(header)}, expected 7")
return False
# Check if all expected clause columns are present (allow order to be different)
# Allow first column to be anything, so we check columns 1-6
header_clauses = header[1:7]
missing_clauses = []
for expected_clause in expected_header_columns:
if expected_clause not in header_clauses:
missing_clauses.append(expected_clause)
if missing_clauses:
print(f"❌ Missing expected clause columns: {missing_clauses}")
return False
# Check if there are extra clause columns
extra_clauses = []
for clause in header_clauses:
if clause not in expected_header_columns:
extra_clauses.append(clause)
if extra_clauses:
print(f"❌ Unexpected extra clause columns: {extra_clauses}")
return False
# Create a mapping from expected clause order to actual column indices
clause_mapping = {}
for i, clause in enumerate(header_clauses):
if clause in expected_header_columns:
clause_mapping[clause] = i
# Parse the CSV data into a dictionary with correct column mapping
csv_data = {}
for row in rows[1:]:
if len(row) >= 7:
name = row[0]
# Map values according to the expected clause order
values = []
for expected_clause in expected_header_columns:
col_index = clause_mapping[expected_clause] + 1 # +1 because we skip first column
values.append(row[col_index])
csv_data[name] = values
# Check if all expected names are present
missing_names = []
for expected_name in expected_data:
if expected_name not in csv_data:
missing_names.append(expected_name)
if missing_names:
print(f"❌ Missing expected names: {missing_names}")
return False
# Check if there are extra names
extra_names = []
for name in csv_data:
if name not in expected_data:
extra_names.append(name)
if extra_names:
print(f"❌ Unexpected extra names: {extra_names}")
return False
# Check values for each person
for name, expected_values in expected_data.items():
actual_values = csv_data[name]
if actual_values != expected_values:
print(f"❌ Values mismatch for {name}:")
print(f" Expected: {expected_values}")
print(f" Got: {actual_values}")
return False
print("✅ CSV content matches expected answer exactly")
return True
except Exception as e:
print(f"❌ Error verifying CSV content: {e}")
return False
def verify_data_accuracy(test_dir: Path) -> bool:
"""Verify that the data values are accurate (all values are non-negative integers)."""
output_file = test_dir / "individual_comment.csv"
try:
with open(output_file, 'r', newline='', encoding='utf-8') as csvfile:
reader = csv.reader(csvfile)
rows = list(reader)
# Skip header row
for i, row in enumerate(rows[1:], 1):
if len(row) >= 7:
name = row[0]
values = row[1:7]
for j, value in enumerate(values, 1):
try:
int_val = int(value)
if int_val < 0:
print(f"❌ Row {i}, column {j}: negative value '{value}' for {name}")
return False
except ValueError:
print(f"❌ Row {i}, column {j}: non-integer value '{value}' for {name}")
return False
print("✅ All data values are valid non-negative integers")
return True
except Exception as e:
print(f"❌ Error verifying data accuracy: {e}")
return False
def verify_file_location(test_dir: Path) -> bool:
"""Verify that the file is in the main directory (not in a subdirectory)."""
output_file = test_dir / "individual_comment.csv"
if output_file.exists():
print("✅ File is located in the main directory")
return True
else:
print("❌ File is not in the main directory")
return False
def main():
"""Main verification function."""
test_dir = get_test_directory()
print("🔍 Verifying Legal Document Individual Comments Task...")
# Define verification steps
verification_steps = [
("Output File Exists", verify_output_file_exists),
("CSV Format", verify_csv_format),
("CSV Content", verify_csv_content),
("Data Accuracy", verify_data_accuracy),
("File Location", verify_file_location),
]
# Run all verification steps
all_passed = True
for step_name, verify_func in verification_steps:
print(f"\n--- {step_name} ---")
if not verify_func(test_dir):
all_passed = False
# Final result
print("\n" + "="*50)
if all_passed:
print("✅ Legal document individual comments task completed correctly!")
print("🎉 Task verification: PASS")
sys.exit(0)
else:
print("❌ Task verification: FAIL")
sys.exit(1)
if __name__ == "__main__":
main()
================================================
FILE: tasks/filesystem/standard/legal_document/solution_tracing/description.md
================================================
Please use FileSystem tools to finish the following task:
### Overview
The folder "legal_files/" contains all versions (Preferred_Stock_Purchase_Agreement_v0.txt -- Preferred_Stock_Purchase_Agreement_v10.txt) of the Stock Purchase Agreement for a corporate investment project.
These files contain comments from four people:
- **Bill Harvey** (Company CEO)
- **Michelle Jackson** (Investor)
- **David Russel** (Company Counsel)
- **Tony Taylor** (Investor Counsel)
Between v1 and v9, these four people made comments on the clauses. The comment format is `[name:content]`, where:
- `name` is the commenter's name
- `content` is the revision note
**Special Note:** If the name is "All parties", it represents a joint comment from all parties, which counts as one comment but does not count toward any individual's personal comment count.
### Task Description
**Your task is to focus on clauses 4.6, 4.16, 6.8, and 6.16 in v5-9** to determine:
1. Who first proposed the idea that eventually led to the final agreed solution
2. In which version's comment it appeared
**Important:** If the final solution was formed through multiple people's comments, count as the originator the person whose comment first provided the core motivation (or part of the idea) that shaped the final solution. The key is to identify who initially proposed the motivation for the final solution.
### Output Requirements
**File Name:** `tracing.csv` (must be placed in the main directory)
**CSV Structure:**
- **First row** (excluding the top-left cell): `4.6, 4.16, 6.8, 6.16`
- **First column** (excluding the top-left cell): `version_number, name`
- **Remaining cells:** Fill in the `version_number` (the version in which the final solution was first proposed; write only the number, nothing else) and the `name` (the person who proposed it) for each clause, as illustrated below
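For illustration only, a `tracing.csv` with the required shape looks like this (the values are placeholders, not answers):

```
,4.6,4.16,6.8,6.16
version_number,<number>,<number>,<number>,<number>
name,<person>,<person>,<person>,<person>
```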
================================================
FILE: tasks/filesystem/standard/legal_document/solution_tracing/meta.json
================================================
{
"task_id": "solution_tracing",
"task_name": "Solution Tracing",
"category_id": "legal_document",
"category_name": "Legal Document",
"description": "Trace the evolution of clause resolutions across document versions to identify who first proposed each final accepted solution.",
"author": "Lingjun Chen",
"created_at": "2025-08-15",
"difficulty": "L3",
"tags": [
"cross-referencing",
"pattern analysis"
],
"mcp": [
"filesystem"
],
"meta_data": {
"stateType": "text",
"stateContent": "legal_document/\n └── legal_files/\n ├── Preferred_Stock_Purchase_Agreement_v0.txt\n ├── Preferred_Stock_Purchase_Agreement_v1.txt\n ├── Preferred_Stock_Purchase_Agreement_v2.txt\n ├── Preferred_Stock_Purchase_Agreement_v3.txt\n ├── Preferred_Stock_Purchase_Agreement_v4.txt\n ├── Preferred_Stock_Purchase_Agreement_v5.txt\n ├── Preferred_Stock_Purchase_Agreement_v6.txt\n ├── Preferred_Stock_Purchase_Agreement_v7.txt\n ├── Preferred_Stock_Purchase_Agreement_v8.txt\n ├── Preferred_Stock_Purchase_Agreement_v9.txt\n └── Preferred_Stock_Purchase_Agreement_v10.txt",
"stateUrl": "https://storage.mcpmark.ai/filesystem/legal_document.zip",
"stateOriginalUrl": "https://www.cooleygo.com/documents/nvca-financing-documents"
}
}
================================================
FILE: tasks/filesystem/standard/legal_document/solution_tracing/verify.py
================================================
#!/usr/bin/env python3
"""
Verification script for Legal Document Solution Tracing Task
"""
import sys
from pathlib import Path
import csv
import os
def get_test_directory() -> Path:
"""Get the test directory from FILESYSTEM_TEST_DIR env var."""
test_root = os.environ.get("FILESYSTEM_TEST_DIR")
if not test_root:
raise ValueError("FILESYSTEM_TEST_DIR environment variable is required")
return Path(test_root)
def verify_output_file_exists(test_dir: Path) -> bool:
"""Verify that the tracing.csv file exists."""
output_file = test_dir / "tracing.csv"
if not output_file.exists():
print("❌ File 'tracing.csv' not found")
return False
print("✅ Output file 'tracing.csv' found")
return True
def verify_csv_format(test_dir: Path) -> bool:
"""Verify that the CSV file has the correct format."""
output_file = test_dir / "tracing.csv"
try:
with open(output_file, 'r', newline='', encoding='utf-8') as csvfile:
reader = csv.reader(csvfile)
rows = list(reader)
if not rows:
print("❌ CSV file is empty")
return False
# Check if there are at least 2 rows (header + data)
if len(rows) < 2:
print("❌ CSV file has insufficient rows")
return False
# Check if header row has correct number of columns
header = rows[0]
if len(header) != 5: # First column (can be anything) + 4 clauses
print(f"❌ Header row has incorrect number of columns: {len(header)}, expected 5")
return False
# Check if data rows have correct number of columns
for i, row in enumerate(rows[1:], 1):
if len(row) != 5:
print(f"❌ Data row {i} has incorrect number of columns: {len(row)}, expected 5")
return False
print("✅ CSV format is correct")
return True
except Exception as e:
print(f"❌ Error reading CSV file: {e}")
return False
def verify_csv_content(test_dir: Path) -> bool:
"""Verify that the CSV content matches the expected answer exactly."""
output_file = test_dir / "tracing.csv"
try:
with open(output_file, 'r', newline='', encoding='utf-8') as csvfile:
reader = csv.reader(csvfile)
rows = list(reader)
# Expected data based on answer.csv
expected_data = {
"version_number": ["5", "6", "7", "8"],
"name": ["Bill Harvey", "Michelle Jackson", "Michelle Jackson", "Tony Taylor"]
}
# Expected header columns (excluding first column which can be anything)
expected_header_columns = ["4.6", "4.16", "6.8", "6.16"]
# Verify header has correct number of columns
header = rows[0]
if len(header) != 5: # First column + 4 clauses
print(f"❌ Header row has incorrect number of columns: {len(header)}, expected 5")
return False
# Check if all expected clause columns are present (allow order to be different)
# Allow first column to be anything, so we check columns 1-4
header_clauses = header[1:5]
missing_clauses = []
for expected_clause in expected_header_columns:
if expected_clause not in header_clauses:
missing_clauses.append(expected_clause)
if missing_clauses:
print(f"❌ Missing expected clause columns: {missing_clauses}")
return False
# Check if there are extra clause columns
extra_clauses = []
for clause in header_clauses:
if clause not in expected_header_columns:
extra_clauses.append(clause)
if extra_clauses:
print(f"❌ Unexpected extra clause columns: {extra_clauses}")
return False
# Create a mapping from expected clause order to actual column indices
clause_mapping = {}
for i, clause in enumerate(header_clauses):
if clause in expected_header_columns:
clause_mapping[clause] = i
# Parse the CSV data into a dictionary with correct column mapping
csv_data = {}
for row in rows[1:]:
if len(row) >= 5:
row_type = row[0] # version_number or name
# Map values according to the expected clause order
values = []
for expected_clause in expected_header_columns:
col_index = clause_mapping[expected_clause] + 1 # +1 because we skip first column
values.append(row[col_index])
csv_data[row_type] = values
# Check if all expected row types are present
missing_types = []
for expected_type in expected_data:
if expected_type not in csv_data:
missing_types.append(expected_type)
if missing_types:
print(f"❌ Missing expected row types: {missing_types}")
return False
# Check if there are extra row types
extra_types = []
for row_type in csv_data:
if row_type not in expected_data:
extra_types.append(row_type)
if extra_types:
print(f"❌ Unexpected extra row types: {extra_types}")
return False
# Check values for each row type
for row_type, expected_values in expected_data.items():
actual_values = csv_data[row_type]
if actual_values != expected_values:
print(f"❌ Values mismatch for {row_type}:")
print(f" Expected: {expected_values}")
print(f" Got: {actual_values}")
return False
print("✅ CSV content matches expected answer exactly")
return True
except Exception as e:
print(f"❌ Error verifying CSV content: {e}")
return False
def verify_data_accuracy(test_dir: Path) -> bool:
"""Verify that the data values are accurate."""
output_file = test_dir / "tracing.csv"
try:
with open(output_file, 'r', newline='', encoding='utf-8') as csvfile:
reader = csv.reader(csvfile)
rows = list(reader)
# Skip header row
for i, row in enumerate(rows[1:], 1):
if len(row) >= 5:
row_type = row[0]
values = row[1:5]
# Check version_number row
if row_type == "version_number":
for j, value in enumerate(values, 1):
try:
int_val = int(value)
if int_val < 5 or int_val > 8:
print(f"❌ Row {i}, column {j}: version number '{value}' is out of expected range [5-8]")
return False
except ValueError:
print(f"❌ Row {i}, column {j}: non-integer version number '{value}'")
return False
# Check name row
elif row_type == "name":
expected_names = ["Bill Harvey", "Michelle Jackson", "Michelle Jackson", "Tony Taylor"]
for j, value in enumerate(values, 1):
if value not in expected_names:
print(f"❌ Row {i}, column {j}: unexpected name '{value}'")
return False
print("✅ All data values are accurate")
return True
except Exception as e:
print(f"❌ Error verifying data accuracy: {e}")
return False
def verify_file_location(test_dir: Path) -> bool:
"""Verify that the file is in the main directory (not in a subdirectory)."""
output_file = test_dir / "tracing.csv"
if output_file.exists():
print("✅ File is located in the main directory")
return True
else:
print("❌ File is not in the main directory")
return False
def main():
"""Main verification function."""
test_dir = get_test_directory()
print("🔍 Verifying Legal Document Solution Tracing Task...")
# Define verification steps
verification_steps = [
("Output File Exists", verify_output_file_exists),
("CSV Format", verify_csv_format),
("CSV Content", verify_csv_content),
("Data Accuracy", verify_data_accuracy),
("File Location", verify_file_location),
]
# Run all verification steps
all_passed = True
for step_name, verify_func in verification_steps:
print(f"\n--- {step_name} ---")
if not verify_func(test_dir):
all_passed = False
# Final result
print("\n" + "="*50)
if all_passed:
print("✅ Legal document solution tracing task completed correctly!")
print("🎉 Task verification: PASS")
sys.exit(0)
else:
print("❌ Task verification: FAIL")
sys.exit(1)
if __name__ == "__main__":
main()
================================================
FILE: tasks/filesystem/standard/papers/author_folders/description.md
================================================
Please use FileSystem tools to finish the following task:
### Task Description
You are given a directory containing multiple paper files. You have a collection of academic papers in HTML format from arXiv. Your task is to analyze these papers, identify authors who have published multiple papers, and organize them into author-specific folders based on specified criteria.
### Task Objectives
#### Part 1: Frequent Authors (≥4 papers)
1. **Extract author information** from all HTML papers in the given directory
2. **Identify authors** who appear in 4 or more papers
3. **Create a directory** `frequent_authors`
4. **Create individual folders** within this directory for each frequent author (lowercase names with underscores)
5. **Copy their papers** to their respective folders
#### Part 2: Prolific 2025 Authors (≥3 papers)
1. **Extract publication dates** along with author information
2. **Identify authors** who published 3 or more papers in 2025
3. **Create a directory** `2025_authors` for 2025 authors
4. **Create individual folders** within this directory for each prolific 2025 author (lowercase names with underscores)
5. **Copy their 2025 papers** to their respective folders
### Expected Output
#### Directory Structure:
```
[given_task_folder]/
├── [original HTML files remain untouched]
├── frequent_authors/ # Authors with ≥4 papers total
│ ├── smith_john/
│ │ └── [copied papers]
│ ├── johnson_sarah/
│ │ └── [copied papers]
│ └── ...
└── 2025_authors/ # Authors with ≥3 papers in 2025
├── williams_david/
│ └── [copied 2025 papers]
├── brown_emily/
│ └── [copied 2025 papers]
└── ...
```
#### Requirements:
- Author folder names should be **lowercase** with underscores replacing spaces/commas (e.g., `smith_john`, `williams_david`)
- Papers should be **copied** (not moved) to preserve originals
- Author extraction should handle various name formats correctly
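A minimal sketch of Part 1, assuming authors are exposed through `citation_author` meta tags in the arXiv HTML (the tag name, the `papers` directory path, and the name normalization are illustrative assumptions; Part 2 follows the same pattern restricted to 2025 papers):

```python
import re
import shutil
from collections import defaultdict
from pathlib import Path

papers_dir = Path("papers")  # hypothetical location of the HTML files
papers_by_author = defaultdict(list)

for html_file in papers_dir.glob("*.html"):
    html = html_file.read_text(errors="ignore")
    # Assumed metadata format: <meta name="citation_author" content="Last, First">
    for author in re.findall(r'<meta name="citation_author" content="([^"]+)"', html):
        last, _, first = (part.strip() for part in author.partition(","))
        folder = f"{first.split()[0]}_{last}".lower().replace(" ", "_") if first else last.lower()
        papers_by_author[folder].append(html_file)

for folder, files in papers_by_author.items():
    if len(files) >= 4:  # frequent authors
        target = papers_dir / "frequent_authors" / folder
        target.mkdir(parents=True, exist_ok=True)
        for paper in files:
            shutil.copy2(paper, target / paper.name)  # copy, never move
```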
================================================
FILE: tasks/filesystem/standard/papers/author_folders/meta.json
================================================
{
"task_id": "author_folders",
"task_name": "Author Folders",
"category_id": "papers",
"category_name": "Papers",
"description": "Analyze academic papers to identify and organize by author, creating separate folders for frequent authors (≥4 papers) and prolific 2025 authors (≥3 papers).",
"author": "Xiangyan Liu",
"created_at": "2025-08-12",
"difficulty": "L3",
"tags": [
"data extraction",
"file organization",
"pattern analysis"
],
"mcp": [
"filesystem"
],
"meta_data": {
"stateType": "text",
"stateContent": "papers/\n ├── 1707.06347.html\n ├── 2105.04165.html\n ├── 2201.11903.html\n ├── 2303.08774.html\n ├── 2306.08640.html\n ├── 2310.02255.html\n ├── 2310.08446.html\n ├── 2312.00849.html\n ├── 2312.07533.html\n ├── 2312.11805.html\n ├── 2402.00253.html\n ├── 2402.03300.html\n ├── 2403.05530.html\n ├── 2404.13046.html\n ├── 2404.14367.html\n ├── 2404.14396.html\n ├── 2405.09818.html\n ├── 2405.13911.html\n ├── 2405.16473.html\n ├── 2405.16640.html\n ├── 2406.08478.html\n ├── 2406.16852.html\n ├── 2406.17294.html\n ├── 2407.01284.html\n ├── 2407.01509.html\n ├── 2407.21783.html\n ├── 2408.03326.html\n ├── 2408.12528.html\n ├── 2409.19256.html\n ├── 2410.05993.html\n ├── 2410.06166.html\n ├── 2410.10563.html\n ├── 2410.13848.html\n ├── 2410.17885.html\n ├── 2410.21276.html\n ├── 2411.07975.html\n ├── 2411.10442.html\n ├── 2411.11930.html\n ├── 2411.14432.html\n ├── 2412.05271.html\n ├── 2412.08443.html\n ├── 2412.10302.html\n ├── 2412.15115.html\n ├── 2412.16720.html\n ├── 2412.17256.html\n ├── 2412.18319.html\n ├── 2412.20631.html\n ├── 2501.04686.html\n ├── 2501.06186.html\n ├── 2501.12599.html\n ├── 2501.12948.html\n ├── 2501.17811.html\n ├── 2502.01456.html\n ├── 2502.09621.html\n ├── 2502.10391.html\n ├── 2502.13923.html\n ├── 2503.01785.html\n ├── 2503.06520.html\n ├── 2503.06749.html\n ├── 2503.07065.html\n ├── 2503.07365.html\n ├── 2503.07536.html\n ├── 2503.10291.html\n ├── 2503.10615.html\n ├── 2503.12937.html\n ├── 2503.13939.html\n ├── 2503.14476.html\n ├── 2503.17352.html\n ├── 2503.18892.html\n ├── 2503.19786.html\n ├── 2503.20783.html\n ├── 2503.21620.html\n ├── 2503.21776.html\n ├── 2503.22679.html\n ├── 2504.02587.html\n ├── 2504.05599.html\n ├── 2504.07491.html\n ├── 2504.07934.html\n ├── 2504.07954.html\n ├── 2504.11455.html\n ├── 2504.14945.html\n ├── 2504.16656.html\n ├── 2505.00703.html\n └── arxiv_2025.bib",
"stateUrl": "https://storage.mcpmark.ai/filesystem/papers.zip",
"stateOriginalUrl": null
}
}
================================================
FILE: tasks/filesystem/standard/papers/author_folders/verify.py
================================================
#!/usr/bin/env python3
"""
Verification script for Paper Organization Task: Author-Based Paper Categorization
"""
import sys
from pathlib import Path
import os
import re
from typing import Dict, List, Set
from html.parser import HTMLParser
from datetime import datetime
def get_test_directory() -> Path:
"""Get the test directory from FILESYSTEM_TEST_DIR env var."""
test_root = os.environ.get("FILESYSTEM_TEST_DIR")
if not test_root:
raise ValueError("FILESYSTEM_TEST_DIR environment variable is required")
return Path(test_root)
class ArxivHTMLParser(HTMLParser):
"""Parser to extract author and date information from arXiv HTML papers."""
def __init__(self):
super().__init__()
self.authors = []
self.publication_date = None
def handle_starttag(self, tag, attrs):
# Look for author metadata tags
if tag == 'meta':
attr_dict = dict(attrs)
if attr_dict.get('name') == 'citation_author':
content = attr_dict.get('content', '')
if content:
self.authors.append(content)
elif attr_dict.get('name') in ['citation_date', 'citation_online_date']:
content = attr_dict.get('content', '')
if content and not self.publication_date:
self.publication_date = content
def extract_paper_info(html_file: Path) -> tuple[List[str], str]:
"""Extract authors and publication year from an HTML paper."""
try:
with open(html_file, 'r', encoding='utf-8', errors='ignore') as f:
content = f.read()
parser = ArxivHTMLParser()
parser.feed(content)
# Extract year from date if available
year = None
if parser.publication_date:
# Parse year from date string (e.g., "2025/03/13")
year_match = re.search(r'(\d{4})', parser.publication_date)
if year_match:
year = year_match.group(1)
return parser.authors, year
except Exception as e:
print(f"Warning: Could not parse {html_file.name}: {e}")
return [], None
def normalize_author_name(author: str) -> str:
"""Normalize author name to lowercase with underscores."""
# Author names are in "Last, First Middle" format
# We need to convert to "first_last" format
# Remove any HTML entities or special characters that shouldn't be there
author = author.strip()
# Split by comma to separate last and first names
parts = author.split(',', 1)
if len(parts) == 2:
last_name = parts[0].strip()
first_names = parts[1].strip()
# Take only the first name (not middle names)
first_name_parts = first_names.split()
if first_name_parts:
first_name = first_name_parts[0]
# Format as "first_last"
normalized = f"{first_name}_{last_name}"
else:
normalized = last_name
else:
# If no comma, use as is
normalized = author
# Convert to lowercase and replace spaces/special chars with underscores
normalized = re.sub(r'[^\w\s-]', '', normalized)
normalized = re.sub(r'[\s-]+', '_', normalized)
return normalized.lower()
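# Illustrative behaviour of normalize_author_name (not exhaustive):
#   "Smith, John A." -> "john_smith"   (first given name + last name)
#   "Doe, Jane"      -> "jane_doe"
#   "Madonna"        -> "madonna"      (no comma: lowercased as-is)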
def verify_directories_exist(test_dir: Path) -> bool:
"""Verify that required directories exist."""
frequent_authors_dir = test_dir / "frequent_authors"
authors_2025_dir = test_dir / "2025_authors"
if not frequent_authors_dir.exists():
print("❌ 'frequent_authors' directory not found")
return False
if not authors_2025_dir.exists():
print("❌ '2025_authors' directory not found")
return False
if not frequent_authors_dir.is_dir():
print("❌ 'frequent_authors' exists but is not a directory")
return False
if not authors_2025_dir.is_dir():
print("❌ '2025_authors' exists but is not a directory")
return False
print("✅ Both required directories exist")
return True
def analyze_papers(test_dir: Path) -> tuple[Dict[str, List[Path]], Dict[str, List[Path]]]:
"""Analyze all HTML papers and return author-paper mappings."""
author_papers = {} # author -> list of papers
author_2025_papers = {} # author -> list of 2025 papers
# Find all HTML files
html_files = list(test_dir.glob("*.html"))
for html_file in html_files:
authors, year = extract_paper_info(html_file)
for author in authors:
if not author:
continue
normalized_name = normalize_author_name(author)
if not normalized_name:
continue
# Track all papers by author
if normalized_name not in author_papers:
author_papers[normalized_name] = []
author_papers[normalized_name].append(html_file)
# Track 2025 papers
if year == '2025':
if normalized_name not in author_2025_papers:
author_2025_papers[normalized_name] = []
author_2025_papers[normalized_name].append(html_file)
return author_papers, author_2025_papers
def verify_frequent_authors(test_dir: Path, author_papers: Dict[str, List[Path]]) -> bool:
"""Verify that authors with ≥4 papers have their folders and papers."""
frequent_authors_dir = test_dir / "frequent_authors"
# Find authors with 4 or more papers
frequent_authors = {author: papers for author, papers in author_papers.items()
if len(papers) >= 4}
if not frequent_authors:
print("⚠️ No authors found with 4 or more papers")
# This might be expected depending on the test data
return True
all_correct = True
for author, expected_papers in frequent_authors.items():
author_dir = frequent_authors_dir / author
# Check if author directory exists
if not author_dir.exists():
print(f"❌ Missing directory for frequent author: {author}")
all_correct = False
continue
# Check if all expected papers are present
for paper in expected_papers:
paper_copy = author_dir / paper.name
if not paper_copy.exists():
print(f"❌ Missing paper {paper.name} in {author} directory")
all_correct = False
# Check for unexpected directories
for item in frequent_authors_dir.iterdir():
if item.is_dir():
dir_name = item.name
if dir_name not in frequent_authors:
# Check if this author has less than 4 papers
if dir_name in author_papers and len(author_papers[dir_name]) < 4:
print(f"❌ Author {dir_name} has only {len(author_papers[dir_name])} papers but has a folder in frequent_authors")
all_correct = False
if all_correct:
print(f"✅ Frequent authors correctly organized ({len(frequent_authors)} authors)")
return all_correct
def verify_2025_authors(test_dir: Path, author_2025_papers: Dict[str, List[Path]]) -> bool:
"""Verify that authors with ≥3 papers in 2025 have their folders and papers."""
authors_2025_dir = test_dir / "2025_authors"
# Find authors with 3 or more papers in 2025
prolific_2025_authors = {author: papers for author, papers in author_2025_papers.items()
if len(papers) >= 3}
if not prolific_2025_authors:
print("⚠️ No authors found with 3 or more papers in 2025")
# This might be expected depending on the test data
return True
all_correct = True
for author, expected_papers in prolific_2025_authors.items():
author_dir = authors_2025_dir / author
# Check if author directory exists
if not author_dir.exists():
print(f"❌ Missing directory for 2025 author: {author}")
all_correct = False
continue
# Check if all expected 2025 papers are present
for paper in expected_papers:
paper_copy = author_dir / paper.name
if not paper_copy.exists():
print(f"❌ Missing 2025 paper {paper.name} in {author} directory")
all_correct = False
# Check for unexpected directories
for item in authors_2025_dir.iterdir():
if item.is_dir():
dir_name = item.name
if dir_name not in prolific_2025_authors:
# Check if this author has less than 3 papers in 2025
if dir_name in author_2025_papers and len(author_2025_papers[dir_name]) < 3:
print(f"❌ Author {dir_name} has only {len(author_2025_papers[dir_name])} papers in 2025 but has a folder in 2025_authors")
all_correct = False
if all_correct:
print(f"✅ 2025 authors correctly organized ({len(prolific_2025_authors)} authors)")
return all_correct
def verify_original_files_intact(test_dir: Path) -> bool:
"""Verify that original HTML files are still present (not moved)."""
html_files = list(test_dir.glob("*.html"))
if not html_files:
print("❌ No original HTML files found in root directory")
return False
print(f"✅ Original HTML files remain intact ({len(html_files)} files)")
return True
def verify_naming_convention(test_dir: Path) -> bool:
"""Verify that author folder names follow the correct naming convention."""
frequent_authors_dir = test_dir / "frequent_authors"
authors_2025_dir = test_dir / "2025_authors"
all_correct = True
# Check frequent_authors subdirectories
for author_dir in frequent_authors_dir.iterdir():
if author_dir.is_dir():
name = author_dir.name
# Check for lowercase and underscores only
if not re.match(r'^[a-z0-9_]+$', name):
print(f"❌ Invalid folder name in frequent_authors: {name} (should be lowercase with underscores)")
all_correct = False
# Check 2025_authors subdirectories
for author_dir in authors_2025_dir.iterdir():
if author_dir.is_dir():
name = author_dir.name
# Check for lowercase and underscores only
if not re.match(r'^[a-z0-9_]+$', name):
print(f"❌ Invalid folder name in 2025_authors: {name} (should be lowercase with underscores)")
all_correct = False
if all_correct:
print("✅ All author folder names follow correct naming convention")
return all_correct
def main():
"""Main verification function."""
try:
test_dir = get_test_directory()
print(f"🔍 Verifying paper organization in: {test_dir}")
# Analyze papers first
print("\n📊 Analyzing papers...")
author_papers, author_2025_papers = analyze_papers(test_dir)
# Run verification checks
checks = [
("Directory existence", lambda: verify_directories_exist(test_dir)),
("Original files intact", lambda: verify_original_files_intact(test_dir)),
("Frequent authors organization", lambda: verify_frequent_authors(test_dir, author_papers)),
("2025 authors organization", lambda: verify_2025_authors(test_dir, author_2025_papers)),
("Naming conventions", lambda: verify_naming_convention(test_dir))
]
all_passed = True
for check_name, check_func in checks:
print(f"\n📋 Checking: {check_name}")
if not check_func():
all_passed = False
if all_passed:
print("\n🎉 All verification checks passed!")
sys.exit(0)
else:
print("\n❌ Some verification checks failed!")
sys.exit(1)
except Exception as e:
print(f"❌ Verification failed with error: {e}")
sys.exit(1)
if __name__ == "__main__":
main()
================================================
FILE: tasks/filesystem/standard/papers/find_math_paper/description.md
================================================
Please use FileSystem tools to finish the following task:
You are given a directory containing multiple paper files. Please help me find a math-related benchmark paper. I don’t remember its name, but I remember it not only checks whether the answer is correct, but also analyzes whether the model suffers from insufficient knowledge, lacks generalization ability, or relies on rote memorization. After finding this paper, rename its corresponding HTML file to `answer.html`.
================================================
FILE: tasks/filesystem/standard/papers/find_math_paper/meta.json
================================================
{
"task_id": "find_math_paper",
"task_name": "Find Math Paper",
"category_id": "papers",
"category_name": "Papers",
"description": "Search through academic papers to identify and locate mathematics-related content that satisfies specific mathematical criteria and research requirements.",
"author": "Xiangyan Liu",
"created_at": "2025-08-12",
"difficulty": "L3",
"tags": [
"pattern analysis",
"data extraction"
],
"mcp": [
"filesystem"
],
"meta_data": {
"stateType": "text",
"stateContent": "papers/\n ├── 1707.06347.html\n ├── 2105.04165.html\n ├── 2201.11903.html\n ├── 2303.08774.html\n ├── 2306.08640.html\n ├── 2310.02255.html\n ├── 2310.08446.html\n ├── 2312.00849.html\n ├── 2312.07533.html\n ├── 2312.11805.html\n ├── 2402.00253.html\n ├── 2402.03300.html\n ├── 2403.05530.html\n ├── 2404.13046.html\n ├── 2404.14367.html\n ├── 2404.14396.html\n ├── 2405.09818.html\n ├── 2405.13911.html\n ├── 2405.16473.html\n ├── 2405.16640.html\n ├── 2406.08478.html\n ├── 2406.16852.html\n ├── 2406.17294.html\n ├── 2407.01284.html\n ├── 2407.01509.html\n ├── 2407.21783.html\n ├── 2408.03326.html\n ├── 2408.12528.html\n ├── 2409.19256.html\n ├── 2410.05993.html\n ├── 2410.06166.html\n ├── 2410.10563.html\n ├── 2410.13848.html\n ├── 2410.17885.html\n ├── 2410.21276.html\n ├── 2411.07975.html\n ├── 2411.10442.html\n ├── 2411.11930.html\n ├── 2411.14432.html\n ├── 2412.05271.html\n ├── 2412.08443.html\n ├── 2412.10302.html\n ├── 2412.15115.html\n ├── 2412.16720.html\n ├── 2412.17256.html\n ├── 2412.18319.html\n ├── 2412.20631.html\n ├── 2501.04686.html\n ├── 2501.06186.html\n ├── 2501.12599.html\n ├── 2501.12948.html\n ├── 2501.17811.html\n ├── 2502.01456.html\n ├── 2502.09621.html\n ├── 2502.10391.html\n ├── 2502.13923.html\n ├── 2503.01785.html\n ├── 2503.06520.html\n ├── 2503.06749.html\n ├── 2503.07065.html\n ├── 2503.07365.html\n ├── 2503.07536.html\n ├── 2503.10291.html\n ├── 2503.10615.html\n ├── 2503.12937.html\n ├── 2503.13939.html\n ├── 2503.14476.html\n ├── 2503.17352.html\n ├── 2503.18892.html\n ├── 2503.19786.html\n ├── 2503.20783.html\n ├── 2503.21620.html\n ├── 2503.21776.html\n ├── 2503.22679.html\n ├── 2504.02587.html\n ├── 2504.05599.html\n ├── 2504.07491.html\n ├── 2504.07934.html\n ├── 2504.07954.html\n ├── 2504.11455.html\n ├── 2504.14945.html\n ├── 2504.16656.html\n ├── 2505.00703.html\n └── arxiv_2025.bib",
"stateUrl": "https://storage.mcpmark.ai/filesystem/papers.zip",
"stateOriginalUrl": null
}
}
================================================
FILE: tasks/filesystem/standard/papers/find_math_paper/verify.py
================================================
#!/usr/bin/env python3
"""
Verification script for Find Math Paper Task
"""
import sys
from pathlib import Path
import os
def get_test_directory() -> Path:
"""Get the test directory from FILESYSTEM_TEST_DIR env var."""
test_root = os.environ.get("FILESYSTEM_TEST_DIR")
if not test_root:
raise ValueError("FILESYSTEM_TEST_DIR environment variable is required")
return Path(test_root)
def verify_answer_file_exists(test_dir: Path) -> bool:
"""Verify that answer.html exists in the papers directory."""
answer_file = test_dir / "answer.html"
if not answer_file.exists():
print("❌ File 'answer.html' not found")
return False
print("✅ answer.html found")
return True
def verify_original_file_removed(test_dir: Path) -> bool:
"""Verify that the original file (2407.01284.html) no longer exists."""
original_file = test_dir / "2407.01284.html"
if original_file.exists():
print("❌ Original file 2407.01284.html still exists")
return False
print("✅ Original file has been renamed")
return True
def main():
"""Main verification function."""
test_dir = get_test_directory()
print("🔍 Verifying Find Math Paper Task...")
# Define verification steps
verification_steps = [
("Answer File Exists", verify_answer_file_exists),
("Original File Renamed", verify_original_file_removed),
]
# Run all verification steps
all_passed = True
for step_name, verify_func in verification_steps:
print(f"\n--- {step_name} ---")
if not verify_func(test_dir):
all_passed = False
# Final result
print("\n" + "="*50)
if all_passed:
print("✅ Paper correctly renamed to answer.html!")
print("🎉 Task verification: PASS")
sys.exit(0)
else:
print("❌ Task verification: FAIL")
sys.exit(1)
if __name__ == "__main__":
main()
================================================
FILE: tasks/filesystem/standard/papers/organize_legacy_papers/description.md
================================================
Please use FileSystem tools to finish the following task:
### Task Description
You are given a directory containing multiple paper files. You have a collection of arXiv papers saved as HTML files in the papers directory, along with a BibTeX file. Your task is to organize the older papers (2023 and earlier) into a structured year-based hierarchy with proper documentation, while leaving newer papers in the original location.
### Task Objectives
1. **Organize by year**: Create a year-based directory structure for papers from 2023 and earlier
2. **Generate documentation**: Create INDEX.md files for each year with paper metadata
3. **Create summary**: Build a master SUMMARY.md file linking to all year indexes
### Detailed Requirements
#### Step 1: Organization
- Create directory structure: `organized/{year}/` where year is extracted from the arXiv ID
- Example: `1707.06347.html` → `organized/2017/1707.06347.html`
- Move each HTML file from 2023 and earlier to its corresponding year folder, keeping original filenames
- Papers from 2024 onwards (arXiv IDs starting with `24` or `25`) should remain in the original papers directory
#### Step 2: Year Index Files
For each year folder, create an `INDEX.md` file containing:
- A markdown table with three columns: `ArXiv ID | Authors | Local Path`
- Extract authors from `<meta name="citation_author">` tags, keeping only the first 3 authors
- If there are more than 3 authors, list the first 3 followed by "et al."
- Format authors as: "Author1, Author2, Author3" or "Author1, Author2, Author3, et al."
- Local Path should be just the filename (e.g., `1707.06347.html`)
- Sort entries by arXiv ID in ascending order
#### Step 3: Master Summary
Create `organized/SUMMARY.md` with:
- A markdown table with columns: `Year | Paper Count | Index Link`
- Index Link should be a relative markdown link (e.g., `[View Index](2017/INDEX.md)`)
- Sort by year in ascending order
### Expected Output Structure
```
papers/
├── arxiv_2025.bib (remains here)
├── (2024+ HTML files remain here)
└── organized/
├── SUMMARY.md
├── 2017/
│ ├── INDEX.md
│ └── 1707.06347.html
├── 2021/
│ ├── INDEX.md
│ └── 2105.04165.html
├── 2022/
│ ├── INDEX.md
│ └── 2201.11903.html
└── 2023/
├── INDEX.md
├── 2303.08774.html
├── 2306.08640.html
├── 2310.02255.html
├── 2310.08446.html
├── 2312.00849.html
├── 2312.07533.html
└── 2312.11805.html
```
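A minimal sketch of the year extraction and move step, assuming the standard post-2007 arXiv ID form `YYMM.NNNNN` (the `papers` path is illustrative):

```python
import shutil
from pathlib import Path

papers_dir = Path("papers")  # hypothetical location
for html_file in papers_dir.glob("*.html"):
    year = 2000 + int(html_file.stem[:2])  # "1707.06347" -> 2017
    if year <= 2023:  # only legacy papers are organized; 2024+ stay in place
        target = papers_dir / "organized" / str(year)
        target.mkdir(parents=True, exist_ok=True)
        shutil.move(str(html_file), str(target / html_file.name))
```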
================================================
FILE: tasks/filesystem/standard/papers/organize_legacy_papers/meta.json
================================================
{
"task_id": "organize_legacy_papers",
"task_name": "Organize Legacy Papers",
"category_id": "papers",
"category_name": "Papers",
"description": "Structure and organize older academic papers from 2023 and earlier into a year-based hierarchical directory system with proper documentation.",
"author": "Xiangyan Liu",
"created_at": "2025-08-12",
"difficulty": "L3",
"tags": [
"file organization",
"data extraction",
"cross-referencing"
],
"mcp": [
"filesystem"
],
"meta_data": {
"stateType": "text",
"stateContent": "papers/\n ├── 1707.06347.html\n ├── 2105.04165.html\n ├── 2201.11903.html\n ├── 2303.08774.html\n ├── 2306.08640.html\n ├── 2310.02255.html\n ├── 2310.08446.html\n ├── 2312.00849.html\n ├── 2312.07533.html\n ├── 2312.11805.html\n ├── 2402.00253.html\n ├── 2402.03300.html\n ├── 2403.05530.html\n ├── 2404.13046.html\n ├── 2404.14367.html\n ├── 2404.14396.html\n ├── 2405.09818.html\n ├── 2405.13911.html\n ├── 2405.16473.html\n ├── 2405.16640.html\n ├── 2406.08478.html\n ├── 2406.16852.html\n ├── 2406.17294.html\n ├── 2407.01284.html\n ├── 2407.01509.html\n ├── 2407.21783.html\n ├── 2408.03326.html\n ├── 2408.12528.html\n ├── 2409.19256.html\n ├── 2410.05993.html\n ├── 2410.06166.html\n ├── 2410.10563.html\n ├── 2410.13848.html\n ├── 2410.17885.html\n ├── 2410.21276.html\n ├── 2411.07975.html\n ├── 2411.10442.html\n ├── 2411.11930.html\n ├── 2411.14432.html\n ├── 2412.05271.html\n ├── 2412.08443.html\n ├── 2412.10302.html\n ├── 2412.15115.html\n ├── 2412.16720.html\n ├── 2412.17256.html\n ├── 2412.18319.html\n ├── 2412.20631.html\n ├── 2501.04686.html\n ├── 2501.06186.html\n ├── 2501.12599.html\n ├── 2501.12948.html\n ├── 2501.17811.html\n ├── 2502.01456.html\n ├── 2502.09621.html\n ├── 2502.10391.html\n ├── 2502.13923.html\n ├── 2503.01785.html\n ├── 2503.06520.html\n ├── 2503.06749.html\n ├── 2503.07065.html\n ├── 2503.07365.html\n ├── 2503.07536.html\n ├── 2503.10291.html\n ├── 2503.10615.html\n ├── 2503.12937.html\n ├── 2503.13939.html\n ├── 2503.14476.html\n ├── 2503.17352.html\n ├── 2503.18892.html\n ├── 2503.19786.html\n ├── 2503.20783.html\n ├── 2503.21620.html\n ├── 2503.21776.html\n ├── 2503.22679.html\n ├── 2504.02587.html\n ├── 2504.05599.html\n ├── 2504.07491.html\n ├── 2504.07934.html\n ├── 2504.07954.html\n ├── 2504.11455.html\n ├── 2504.14945.html\n ├── 2504.16656.html\n ├── 2505.00703.html\n └── arxiv_2025.bib",
"stateUrl": "https://storage.mcpmark.ai/filesystem/papers.zip",
"stateOriginalUrl": null
}
}
================================================
FILE: tasks/filesystem/standard/papers/organize_legacy_papers/verify.py
================================================
#!/usr/bin/env python3
"""
Verification script for Papers Collection Cleanup and Organization Task
"""
import sys
from pathlib import Path
import re
import os
def get_test_directory() -> Path:
"""Get the test directory from FILESYSTEM_TEST_DIR env var."""
test_root = os.environ.get("FILESYSTEM_TEST_DIR")
if not test_root:
raise ValueError("FILESYSTEM_TEST_DIR environment variable is required")
return Path(test_root)
def verify_papers_remain(test_dir: Path) -> bool:
"""Verify that BibTeX and 2024+ papers remain in original directory."""
papers_dir = test_dir
# Check BibTeX file still exists
bib_file = papers_dir / "arxiv_2025.bib"
if not bib_file.exists():
print("❌ BibTeX file arxiv_2025.bib not found")
return False
print("✅ BibTeX file remains in place")
# Check that 2024+ papers remain in original directory
found_2024_plus = False
if papers_dir.exists():
for html_file in papers_dir.glob("*.html"):
arxiv_id = html_file.stem
year_part = arxiv_id[:2] if len(arxiv_id) >= 2 else ""
if year_part.isdigit():
year = int(year_part)
if year >= 24:
found_2024_plus = True
break
if found_2024_plus:
print("✅ 2024+ papers remain in original directory")
else:
print("⚠️ No 2024+ papers found (this may be expected if none existed)")
# Check that pre-2024 papers are NOT in original directory
pre_2024_found = []
if papers_dir.exists():
for html_file in papers_dir.glob("*.html"):
arxiv_id = html_file.stem
year_part = arxiv_id[:2] if len(arxiv_id) >= 2 else ""
if year_part.isdigit():
year = int(year_part)
if year < 24:
pre_2024_found.append(html_file.name)
if pre_2024_found:
print(f"❌ Pre-2024 papers still in original directory: {pre_2024_found[:3]}...")
return False
print("✅ Pre-2024 papers have been moved")
return True
def verify_directory_structure(test_dir: Path) -> bool:
"""Verify the organized directory structure exists."""
organized_dir = test_dir / "organized"
if not organized_dir.exists():
print("❌ organized/ directory not found")
return False
print("✅ organized/ directory exists")
# Expected years based on pre-2024 papers
expected_years = ["2017", "2021", "2022", "2023"]
found_years = []
for year in expected_years:
year_dir = organized_dir / year
if year_dir.exists() and year_dir.is_dir():
found_years.append(year)
if len(found_years) != len(expected_years):
print(f"❌ Expected year directories {expected_years}, found {found_years}")
return False
print(f"✅ All expected year directories exist: {found_years}")
return True
def verify_papers_moved(test_dir: Path) -> bool:
"""Verify papers are correctly moved to year folders."""
organized_dir = test_dir / "organized"
# Expected paper distribution
expected_papers = {
"2017": ["1707.06347.html"],
"2021": ["2105.04165.html"],
"2022": ["2201.11903.html"],
"2023": ["2303.08774.html", "2306.08640.html", "2310.02255.html",
"2310.08446.html", "2312.00849.html", "2312.07533.html",
"2312.11805.html"]
}
all_correct = True
for year, papers in expected_papers.items():
year_dir = organized_dir / year
if not year_dir.exists():
print(f"❌ Year directory {year} doesn't exist")
return False
actual_papers = sorted([f.name for f in year_dir.glob("*.html")])
expected_sorted = sorted(papers)
if actual_papers != expected_sorted:
print(f"❌ Papers in {year}/: expected {expected_sorted}, found {actual_papers}")
all_correct = False
else:
print(f"✅ Correct papers in {year}/: {len(actual_papers)} files")
return all_correct
def verify_index_files(test_dir: Path) -> bool:
"""Verify INDEX.md files exist and have correct format."""
organized_dir = test_dir / "organized"
years = ["2017", "2021", "2022", "2023"]
for year in years:
index_file = organized_dir / year / "INDEX.md"
if not index_file.exists():
print(f"❌ INDEX.md missing in {year}/")
return False
content = index_file.read_text()
# Check for table format
if "ArXiv ID" not in content or "Authors" not in content or "Local Path" not in content:
print(f"❌ INDEX.md in {year}/ missing required columns")
return False
# Check that papers are listed
year_dir = organized_dir / year
html_files = list(year_dir.glob("*.html"))
for html_file in html_files:
arxiv_id = html_file.stem
if arxiv_id not in content:
print(f"❌ INDEX.md in {year}/ missing paper {arxiv_id}")
return False
print(f"✅ INDEX.md in {year}/ has correct format")
return True
def verify_author_extraction(test_dir: Path) -> bool:
"""Verify that authors are correctly extracted from HTML metadata (max 3 authors)."""
organized_dir = test_dir / "organized"
# Check a sample paper's authors
sample_file = organized_dir / "2017" / "1707.06347.html"
if not sample_file.exists():
print("❌ Cannot verify author extraction - sample file missing")
return False
# Read the HTML to get expected authors
html_content = sample_file.read_text()
    author_pattern = r'<meta name="citation_author" content="([^"]+)"'
    all_authors = re.findall(author_pattern, html_content)
    # Locate this paper's row in the 2017 INDEX.md
    index_file = organized_dir / "2017" / "INDEX.md"
    index_content = index_file.read_text()
    found = False
    for line in index_content.split('\n'):
        if "1707.06347" in line and '---' not in line:
            found = True
            if len(all_authors) > 3:
# Should have first 3 authors and "et al."
if "et al." not in line:
print("❌ Missing 'et al.' for paper with >3 authors")
return False
# Check first 3 authors are present
for author in all_authors[:3]:
if author not in line:
print(f"❌ Author '{author}' not found in INDEX.md")
return False
# Check that 4th author is NOT present
if len(all_authors) > 3 and all_authors[3] in line:
print(f"❌ Fourth author '{all_authors[3]}' should not be in INDEX.md")
return False
else:
# Should have all authors, no "et al."
if "et al." in line:
print("❌ Should not have 'et al.' for paper with ≤3 authors")
return False
for author in all_authors:
if author not in line:
print(f"❌ Author '{author}' not found in INDEX.md")
return False
break
if not found:
print("❌ Paper 1707.06347 not found in INDEX.md")
return False
print("✅ Authors correctly extracted (max 3) from HTML metadata")
# Additional check: verify 3-author limit across all papers
print("\nVerifying 3-author limit across all papers...")
years = ["2017", "2021", "2022", "2023"]
for year in years:
year_dir = organized_dir / year
if not year_dir.exists():
continue
index_file = year_dir / "INDEX.md"
if not index_file.exists():
continue
index_content = index_file.read_text()
# Check each HTML file in the year directory
for html_file in year_dir.glob("*.html"):
arxiv_id = html_file.stem
# Get actual authors from HTML
html_content = html_file.read_text()
            authors = re.findall(r'<meta name="citation_author" content="([^"]+)"', html_content)
            # Find this paper's row in INDEX.md and pull out the Authors cell
            # (assumes the documented column order: | ArXiv ID | Authors | Local Path |)
            for line in index_content.split('\n'):
                if arxiv_id not in line or '---' in line:
                    continue
                cells = [cell.strip() for cell in line.split('|')]
                author_parts = cells[2] if len(cells) > 2 else ""
                if len(authors) > 3:
if "et al." not in line:
print(f"❌ {year}/{arxiv_id}: Missing 'et al.' for {len(authors)} authors")
return False
elif "et al." in line:
print(f"❌ {year}/{arxiv_id}: Unexpected 'et al.' for {len(authors)} authors")
return False
# Verify no more than 3 authors are listed
author_count = author_parts.count(',') + 1 if author_parts.strip() else 0
if "et al." in author_parts:
author_count -= 1 # Don't count "et al." as an author
if author_count > 3:
print(f"❌ {year}/{arxiv_id}: More than 3 authors listed")
return False
break
print("✅ All papers respect the 3-author limit")
return True
def verify_summary_file(test_dir: Path) -> bool:
"""Verify SUMMARY.md exists and has correct content."""
summary_file = test_dir / "organized" / "SUMMARY.md"
if not summary_file.exists():
print("❌ SUMMARY.md not found")
return False
content = summary_file.read_text()
# Check for required columns
if "Year" not in content or "Paper Count" not in content or "Index Link" not in content:
print("❌ SUMMARY.md missing required columns")
return False
# Check for year entries
expected_years = ["2017", "2021", "2022", "2023"]
for year in expected_years:
if year not in content:
print(f"❌ SUMMARY.md missing year {year}")
return False
# Check for links to INDEX.md files
expected_links = [
f"{year}/INDEX.md" for year in expected_years
]
for link in expected_links:
if link not in content:
print(f"❌ SUMMARY.md missing link to {link}")
return False
# Check paper counts
expected_counts = {
"2017": 1,
"2021": 1,
"2022": 1,
"2023": 7
}
for year, count in expected_counts.items():
# Look for the row with this year
for line in content.split('\n'):
if f"| {year}" in line or f"|{year}" in line:
if str(count) not in line:
print(f"❌ SUMMARY.md has incorrect paper count for {year}")
return False
break
print("✅ SUMMARY.md has correct format and content")
return True
def verify_sorting(test_dir: Path) -> bool:
"""Verify that entries are sorted correctly."""
organized_dir = test_dir / "organized"
# Check SUMMARY.md year sorting
summary_file = organized_dir / "SUMMARY.md"
content = summary_file.read_text()
# Extract years from table rows
years_in_summary = []
for line in content.split('\n'):
if '|' in line and any(year in line for year in ["2017", "2021", "2022", "2023"]):
# Extract year from the line
for year in ["2017", "2021", "2022", "2023"]:
if year in line:
years_in_summary.append(year)
break
if years_in_summary != sorted(years_in_summary):
print(f"❌ SUMMARY.md years not sorted: {years_in_summary}")
return False
print("✅ SUMMARY.md years sorted correctly")
# Check INDEX.md arxiv ID sorting for one year
index_file = organized_dir / "2023" / "INDEX.md"
if index_file.exists():
content = index_file.read_text()
arxiv_ids = []
for line in content.split('\n'):
if '|' in line and '23' in line and 'ArXiv ID' not in line and '---' not in line:
# Extract arxiv ID
match = re.search(r'23\d{2}\.\d{5}', line)
if match:
arxiv_ids.append(match.group())
if arxiv_ids != sorted(arxiv_ids):
print(f"❌ INDEX.md arxiv IDs not sorted in 2023/")
return False
print("✅ INDEX.md entries sorted by arxiv ID")
return True
def main():
"""Main verification function."""
test_dir = get_test_directory()
print("🔍 Verifying Papers Collection Cleanup and Organization...")
# Define verification steps
verification_steps = [
("Papers Remain/Move Verification", verify_papers_remain),
("Directory Structure", verify_directory_structure),
("Papers Moved Correctly", verify_papers_moved),
("Index Files Format", verify_index_files),
("Author Extraction", verify_author_extraction),
("Summary File", verify_summary_file),
("Sorting Verification", verify_sorting),
]
# Run all verification steps
all_passed = True
for step_name, verify_func in verification_steps:
print(f"\n--- {step_name} ---")
try:
if not verify_func(test_dir):
all_passed = False
except Exception as e:
print(f"❌ Error in {step_name}: {e}")
all_passed = False
# Final result
print("\n" + "="*50)
if all_passed:
print("✅ Papers organized correctly!")
print("🎉 Task verification: PASS")
sys.exit(0)
else:
print("❌ Task verification: FAIL")
sys.exit(1)
if __name__ == "__main__":
main()
================================================
FILE: tasks/filesystem/standard/student_database/duplicate_name/description.md
================================================
Please use FileSystem tools to finish the following task:
Please help me identify duplicate names among all 150 students. Do not use Python code. Then generate a `namesake.txt` file to record the results in the following format, with each group written in three lines:
name: xxx
count: xxx
ids: xxx, xxx, ...
Leave one blank line between every two groups. If there are multiple duplicates, just list all corresponding IDs in the third line.
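For illustration, a `namesake.txt` with two hypothetical groups would look like this (the names and IDs below are placeholders, not the answer):

```
name: Jane Doe
count: 2
ids: 20100001, 20100002

name: John Roe
count: 3
ids: 20100003, 20100004, 20100005
```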
================================================
FILE: tasks/filesystem/standard/student_database/duplicate_name/meta.json
================================================
{
"task_id": "duplicate_name",
"task_name": "Duplicate Name",
"category_id": "student_database",
"category_name": "Student Database",
"description": "Identify students with identical names from a 150-student database and generate a formatted namesake grouping report file.",
"author": "Lingjun Chen",
"created_at": "2025-08-10",
"difficulty": "L3",
"tags": [
"pattern analysis",
"data extraction"
],
"mcp": [
"filesystem"
],
"meta_data": {
"stateType": "text",
"stateContent": "student_database/\n ├── 20101250_Patricia_Jones/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20101701_Isabella_Davis/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20102572_Michael_Taylor/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20104233_Robert_Lopez/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20104498_Sarah_Brown/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20104653_Sophia_Brown/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20104675_Michael_Gonzalez/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20104846_Christopher_Brown/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20107487_Mia_Martin/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20108742_Sarah_Brown/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20109144_Emma_Thomas/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20109803_Oliver_Hernandez/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20111634_Isabella_Thomas/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20112439_Christopher_Moore/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20113368_William_Wilson/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20113603_Robert_Rodriguez/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20114397_Isabella_Martin/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20114869_Ethan_Martin/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20115252_Mason_Johnson/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20115632_Elizabeth_Anderson/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20115753_Charlotte_Johnson/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20115924_Michael_Lopez/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20116232_Olivia_Lopez/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20119528_Thomas_Brown/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20122427_Karen_Gonzalez/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20122977_Evelyn_Miller/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20123376_Joseph_Johnson/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20125451_Barbara_Brown/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20126203_Barbara_Davis/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20126394_Olivia_Williams/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20126471_Ethan_Taylor/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20127423_John_Williams/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20128249_Oliver_Smith/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20128879_Christopher_Taylor/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20129898_Jessica_Johnson/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20131271_Olivia_Brown/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20131518_Sophia_Smith/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20132026_Isabella_Smith/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20132370_James_Brown/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20132669_Noah_Smith/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 
20133527_Mason_Jackson/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20133697_Isabella_Smith/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20135821_Thomas_Wilson/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20136681_Benjamin_Anderson/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20136890_Benjamin_Brown/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20137514_Lucas_Anderson/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20139234_Harper_Martinez/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20139637_Noah_Johnson/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20139647_Patricia_Lopez/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20141421_Linda_Gonzalez/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20142085_William_Anderson/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20142383_Amelia_Brown/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20143406_Susan_Martin/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20143830_James_Garcia/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20146035_Christopher_Garcia/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20146277_William_Anderson/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20146279_Christopher_Moore/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20147301_James_Jones/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20147789_James_Anderson/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20148681_John_Hernandez/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20148778_Susan_Anderson/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20149712_Jessica_Rodriguez/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20151012_Harper_Miller/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20153174_Benjamin_Jackson/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20153412_Charlotte_Martin/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20153606_James_Anderson/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20153687_Richard_Taylor/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20154518_John_Gonzalez/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20154710_Benjamin_Rodriguez/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20156469_Jennifer_Hernandez/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20156522_Jennifer_Martinez/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20156851_Noah_Anderson/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20157943_Harper_Williams/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20158266_Sophia_Moore/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20158294_Sophia_Wilson/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20158819_Sarah_Wilson/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20159113_John_Rodriguez/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20159695_James_Moore/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20161279_William_Moore/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20162253_Mason_Rodriguez/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20162542_Mia_Anderson/\n 
│ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20163356_Ava_Anderson/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20164515_Patricia_Moore/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20164801_Noah_Rodriguez/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20165511_Mary_Gonzalez/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20166436_Christopher_Jackson/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20166487_Barbara_Hernandez/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20166564_Ava_Lopez/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20166998_Ava_Lopez/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20168311_Lucas_Jackson/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20168491_Karen_Martinez/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20169515_Thomas_Taylor/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20171050_Christopher_Rodriguez/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20171406_Mary_Anderson/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20171613_Ethan_Moore/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20172106_Isabella_Rodriguez/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20173259_Michael_Anderson/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20173492_Richard_Miller/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20173501_Mary_Smith/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20173517_Susan_Anderson/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20174207_Richard_Wilson/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20174369_Mary_Garcia/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20175314_William_Taylor/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20176169_Lucas_Smith/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20176947_Noah_Miller/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20177389_James_Smith/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20178687_Isabella_Anderson/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20179461_William_Johnson/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20179690_Linda_Thomas/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20181056_Sarah_Hernandez/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20182020_Patricia_Taylor/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20182390_Ethan_Wilson/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20183149_David_Smith/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20183219_Charlotte_Williams/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20184489_Jessica_Gonzalez/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20186154_Charlotte_Smith/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20186510_James_Thomas/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20187107_David_Martinez/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20187144_Mary_Jackson/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20187892_Christopher_Taylor/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20187921_Mary_Jones/\n │ ├── basic_info.txt\n │ └── 
recommendation_letter.txt\n ├── 20187967_Sarah_Davis/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20188937_James_Moore/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20189123_Mary_Martin/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20189192_Olivia_Jones/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20189268_Emma_Williams/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20189854_William_Taylor/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20191265_Joseph_Lopez/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20192725_Robert_Martinez/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20194054_Michael_Jones/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20194160_Benjamin_Jackson/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20194164_Sarah_Jones/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20194525_John_Taylor/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20195164_Jennifer_Gonzalez/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20195982_David_Jackson/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20196776_William_Brown/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20196896_Olivia_Jones/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20196961_Joseph_Thomas/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20196998_Ethan_Wilson/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20198548_Evelyn_Moore/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20199036_Benjamin_Hernandez/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20199583_Mary_Brown/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20199735_Mason_Johnson/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20199872_Sophia_Jackson/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20199980_James_Rodriguez/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20201385_John_Taylor/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20201800_John_Jones/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20202548_Robert_Miller/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20203855_Mia_Miller/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n └── 20204611_Sarah_Wilson/\n ├── basic_info.txt\n └── recommendation_letter.txt",
"stateUrl": "https://storage.mcpmark.ai/filesystem/student_database.zip",
"stateOriginalUrl": null
}
}
================================================
FILE: tasks/filesystem/standard/student_database/duplicate_name/verify.py
================================================
#!/usr/bin/env python3
"""
Verification script for Student Database Task: Find Duplicate Names
Simplified version that only checks against expected results without folder validation
"""
import sys
from pathlib import Path
import os
def get_test_directory() -> Path:
"""Get the test directory from FILESYSTEM_TEST_DIR env var."""
test_root = os.environ.get("FILESYSTEM_TEST_DIR")
if not test_root:
raise ValueError("FILESYSTEM_TEST_DIR environment variable is required")
return Path(test_root)
def verify_namesake_file_exists(test_dir: Path) -> bool:
"""Verify that the namesake.txt file exists."""
namesake_file = test_dir / "namesake.txt"
if not namesake_file.exists():
print("❌ File 'namesake.txt' not found")
return False
print("✅ Namesake file found")
return True
def parse_namesake_file(test_dir: Path) -> dict:
"""Parse the namesake.txt file and return structured data."""
namesake_file = test_dir / "namesake.txt"
try:
content = namesake_file.read_text()
lines = content.strip().split('\n')
namesakes = {}
current_line = 0
while current_line < len(lines):
# Skip blank lines
if not lines[current_line].strip():
current_line += 1
continue
# Check if we have enough lines for a complete group
if current_line + 2 >= len(lines):
print(f"❌ Incomplete group at line {current_line + 1}")
return {}
# Parse group
name_line = lines[current_line].strip()
count_line = lines[current_line + 1].strip()
ids_line = lines[current_line + 2].strip()
# Extract name
if not name_line.startswith("name: "):
print(f"❌ Invalid name line format at line {current_line + 1}: {name_line}")
return {}
name = name_line.replace("name: ", "").strip()
# Extract count
if not count_line.startswith("count: "):
print(f"❌ Invalid count line format at line {current_line + 2}: {count_line}")
return {}
count_str = count_line.replace("count: ", "").strip()
try:
count = int(count_str)
except ValueError:
print(f"❌ Invalid count format: {count_str}")
return {}
# Extract IDs
if not ids_line.startswith("ids: "):
print(f"❌ Invalid ids line format at line {current_line + 3}: {ids_line}")
return {}
ids_str = ids_line.replace("ids: ", "").strip()
ids = [id.strip() for id in ids_str.split(",")]
namesakes[name] = {
'count': count,
'ids': ids
}
current_line += 4 # Skip to next group (after blank line)
return namesakes
except Exception as e:
print(f"❌ Error parsing namesake file: {e}")
return {}
def verify_against_expected_results(namesakes: dict) -> bool:
"""Verify that the results match the expected answer.md content exactly."""
# Expected duplicate names from answer.md (hardcoded)
expected_duplicates = {
'Isabella Smith': ['20132026', '20133697'],
'Ava Lopez': ['20166564', '20166998'],
'James Moore': ['20159695', '20188937'],
'William Taylor': ['20175314', '20189854'],
'Ethan Wilson': ['20182390', '20196998'],
'Christopher Taylor': ['20128879', '20187892'],
'William Anderson': ['20142085', '20146277'],
'James Anderson': ['20147789', '20153606'],
'Olivia Jones': ['20189192', '20196896'],
'Mason Johnson': ['20115252', '20199735'],
'Benjamin Jackson': ['20153174', '20194160'],
'John Taylor': ['20194525', '20201385'],
'Susan Anderson': ['20148778', '20173517'],
'Christopher Moore': ['20112439', '20146279'],
'Sarah Wilson': ['20158819', '20204611'],
'Sarah Brown': ['20104498', '20108742']
}
# Check if exactly 16 duplicate names are found
if len(namesakes) != 16:
print(f"❌ Expected exactly 16 duplicate names, but found {len(namesakes)}")
return False
# Check if all expected duplicate names are present
for expected_name in expected_duplicates:
if expected_name not in namesakes:
print(f"❌ Missing expected duplicate name: '{expected_name}'")
return False
# Check if all namesakes in the file are actually duplicates
for name, data in namesakes.items():
if name not in expected_duplicates:
print(f"❌ Unexpected duplicate name found: '{name}' (not in expected list)")
return False
expected_ids = set(expected_duplicates[name])
stated_ids = set(data['ids'])
if expected_ids != stated_ids:
print(f"❌ ID mismatch for '{name}':")
print(f" Expected: {sorted(expected_ids)}")
print(f" Stated: {sorted(stated_ids)}")
return False
# Verify count matches
if data['count'] != 2:
print(f"❌ Count mismatch for '{name}': expected 2, got {data['count']}")
return False
print("✅ All 16 expected duplicate names are correctly identified")
print("✅ All student IDs match expected results")
print("✅ All counts are correct (2 for each duplicate name)")
return True
def main():
"""Main verification function."""
test_dir = get_test_directory()
print("🔍 Verifying Student Database Task: Find Duplicate Names...")
# Check if namesake file exists
print("\n--- File Existence Check ---")
if not verify_namesake_file_exists(test_dir):
print("\n❌ Basic verification failed, cannot proceed with content verification")
sys.exit(1)
# Parse the file and run content verification
print("\n--- Content Verification ---")
namesakes = parse_namesake_file(test_dir)
if not namesakes:
print("❌ Failed to parse namesake file")
sys.exit(1)
# Verify against expected results
print("\n--- Results Verification ---")
if not verify_against_expected_results(namesakes):
print("\n❌ Task verification: FAIL")
sys.exit(1)
# Final result
print("\n" + "="*50)
print("✅ Namesake identification completed correctly!")
print(f"🎉 Found exactly {len(namesakes)} duplicate names (16 expected)")
print("🎉 Task verification: PASS")
sys.exit(0)
if __name__ == "__main__":
main()
================================================
FILE: tasks/filesystem/standard/student_database/english_talent/description.md
================================================
Please use FileSystem tools to finish the following task:
We are now recruiting students proficient in English to run the school’s English media operations. To contact them, select, from the total of 150 students, those who **meet both of the following criteria**:
1. Rated **S** or **A** grade level in `recommendation_letter.txt` by their teachers.
2. TOEFL score in the basic info is **higher than or equal to 100**.
Please compile all their names, IDs, and emails into a `qualified_students.txt` file, in the following format:
name: xxx
id: xxx
email: xxx
Each person’s information should occupy three lines, with one blank line between each block.
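For instance, one block for a hypothetical qualifying student (placeholder values only, not an actual answer) would look like:
name: Jane Doe
id: 20100000
email: jane.doe@example.com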
================================================
FILE: tasks/filesystem/standard/student_database/english_talent/meta.json
================================================
{
"task_id": "english_talent",
"task_name": "English Talent",
"category_id": "student_database",
"category_name": "Student Database",
"description": "Select qualified students with S/A recommendation grades and TOEFL scores ≥100 for English media operations recruitment opportunities.",
"author": "Lingjun Chen",
"created_at": "2025-08-10",
"difficulty": "L3",
"tags": [
"data extraction",
"cross-referencing",
"pattern analysis"
],
"mcp": [
"filesystem"
],
"meta_data": {
"stateType": "text",
"stateContent": "student_database/\n ├── 20101250_Patricia_Jones/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20101701_Isabella_Davis/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20102572_Michael_Taylor/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20104233_Robert_Lopez/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20104498_Sarah_Brown/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20104653_Sophia_Brown/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20104675_Michael_Gonzalez/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20104846_Christopher_Brown/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20107487_Mia_Martin/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20108742_Sarah_Brown/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20109144_Emma_Thomas/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20109803_Oliver_Hernandez/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20111634_Isabella_Thomas/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20112439_Christopher_Moore/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20113368_William_Wilson/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20113603_Robert_Rodriguez/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20114397_Isabella_Martin/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20114869_Ethan_Martin/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20115252_Mason_Johnson/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20115632_Elizabeth_Anderson/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20115753_Charlotte_Johnson/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20115924_Michael_Lopez/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20116232_Olivia_Lopez/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20119528_Thomas_Brown/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20122427_Karen_Gonzalez/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20122977_Evelyn_Miller/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20123376_Joseph_Johnson/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20125451_Barbara_Brown/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20126203_Barbara_Davis/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20126394_Olivia_Williams/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20126471_Ethan_Taylor/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20127423_John_Williams/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20128249_Oliver_Smith/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20128879_Christopher_Taylor/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20129898_Jessica_Johnson/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20131271_Olivia_Brown/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20131518_Sophia_Smith/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20132026_Isabella_Smith/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20132370_James_Brown/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20132669_Noah_Smith/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 
20133527_Mason_Jackson/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20133697_Isabella_Smith/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20135821_Thomas_Wilson/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20136681_Benjamin_Anderson/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20136890_Benjamin_Brown/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20137514_Lucas_Anderson/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20139234_Harper_Martinez/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20139637_Noah_Johnson/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20139647_Patricia_Lopez/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20141421_Linda_Gonzalez/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20142085_William_Anderson/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20142383_Amelia_Brown/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20143406_Susan_Martin/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20143830_James_Garcia/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20146035_Christopher_Garcia/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20146277_William_Anderson/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20146279_Christopher_Moore/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20147301_James_Jones/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20147789_James_Anderson/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20148681_John_Hernandez/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20148778_Susan_Anderson/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20149712_Jessica_Rodriguez/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20151012_Harper_Miller/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20153174_Benjamin_Jackson/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20153412_Charlotte_Martin/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20153606_James_Anderson/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20153687_Richard_Taylor/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20154518_John_Gonzalez/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20154710_Benjamin_Rodriguez/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20156469_Jennifer_Hernandez/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20156522_Jennifer_Martinez/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20156851_Noah_Anderson/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20157943_Harper_Williams/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20158266_Sophia_Moore/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20158294_Sophia_Wilson/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20158819_Sarah_Wilson/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20159113_John_Rodriguez/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20159695_James_Moore/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20161279_William_Moore/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20162253_Mason_Rodriguez/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20162542_Mia_Anderson/\n 
│ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20163356_Ava_Anderson/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20164515_Patricia_Moore/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20164801_Noah_Rodriguez/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20165511_Mary_Gonzalez/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20166436_Christopher_Jackson/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20166487_Barbara_Hernandez/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20166564_Ava_Lopez/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20166998_Ava_Lopez/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20168311_Lucas_Jackson/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20168491_Karen_Martinez/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20169515_Thomas_Taylor/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20171050_Christopher_Rodriguez/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20171406_Mary_Anderson/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20171613_Ethan_Moore/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20172106_Isabella_Rodriguez/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20173259_Michael_Anderson/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20173492_Richard_Miller/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20173501_Mary_Smith/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20173517_Susan_Anderson/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20174207_Richard_Wilson/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20174369_Mary_Garcia/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20175314_William_Taylor/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20176169_Lucas_Smith/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20176947_Noah_Miller/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20177389_James_Smith/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20178687_Isabella_Anderson/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20179461_William_Johnson/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20179690_Linda_Thomas/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20181056_Sarah_Hernandez/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20182020_Patricia_Taylor/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20182390_Ethan_Wilson/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20183149_David_Smith/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20183219_Charlotte_Williams/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20184489_Jessica_Gonzalez/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20186154_Charlotte_Smith/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20186510_James_Thomas/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20187107_David_Martinez/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20187144_Mary_Jackson/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20187892_Christopher_Taylor/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20187921_Mary_Jones/\n │ ├── basic_info.txt\n │ └── 
recommendation_letter.txt\n ├── 20187967_Sarah_Davis/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20188937_James_Moore/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20189123_Mary_Martin/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20189192_Olivia_Jones/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20189268_Emma_Williams/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20189854_William_Taylor/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20191265_Joseph_Lopez/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20192725_Robert_Martinez/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20194054_Michael_Jones/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20194160_Benjamin_Jackson/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20194164_Sarah_Jones/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20194525_John_Taylor/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20195164_Jennifer_Gonzalez/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20195982_David_Jackson/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20196776_William_Brown/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20196896_Olivia_Jones/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20196961_Joseph_Thomas/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20196998_Ethan_Wilson/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20198548_Evelyn_Moore/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20199036_Benjamin_Hernandez/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20199583_Mary_Brown/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20199735_Mason_Johnson/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20199872_Sophia_Jackson/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20199980_James_Rodriguez/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20201385_John_Taylor/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20201800_John_Jones/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20202548_Robert_Miller/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20203855_Mia_Miller/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n └── 20204611_Sarah_Wilson/\n ├── basic_info.txt\n └── recommendation_letter.txt",
"stateUrl": "https://storage.mcpmark.ai/filesystem/student_database.zip",
"stateOriginalUrl": null
}
}
================================================
FILE: tasks/filesystem/standard/student_database/english_talent/verify.py
================================================
#!/usr/bin/env python3
"""
Verification script for Student Database Task: English Talent Recruitment
"""
import sys
from pathlib import Path
import re
import os
def get_test_directory() -> Path:
"""Get the test directory from FILESYSTEM_TEST_DIR env var."""
test_root = os.environ.get("FILESYSTEM_TEST_DIR")
if not test_root:
raise ValueError("FILESYSTEM_TEST_DIR environment variable is required")
return Path(test_root)
def verify_qualified_students_file_exists(test_dir: Path) -> bool:
"""Verify that the qualified_students.txt file exists."""
answer_file = test_dir / "qualified_students.txt"
if not answer_file.exists():
print("❌ File 'qualified_students.txt' not found")
return False
print("✅ Qualified students file found")
return True
def verify_file_format(test_dir: Path) -> bool:
"""Verify that the qualified_students.txt file has the correct format."""
answer_file = test_dir / "qualified_students.txt"
try:
content = answer_file.read_text()
lines = content.strip().split('\n')
if not lines:
print("❌ File is empty")
return False
# Check if content follows the expected pattern
# Each student should have 3 lines: name, id, email
# Students should be separated by blank lines
current_line = 0
student_count = 0
while current_line < len(lines):
# Skip blank lines
if not lines[current_line].strip():
current_line += 1
continue
# Check if we have enough lines for a complete student
if current_line + 2 >= len(lines):
print(f"❌ Incomplete student entry at line {current_line + 1}")
return False
# Verify name line format
if not lines[current_line].strip().startswith("name: "):
print(f"❌ Invalid name line format at line {current_line + 1}: {lines[current_line]}")
return False
# Verify id line format
if not lines[current_line + 1].strip().startswith("id: "):
print(f"❌ Invalid id line format at line {current_line + 2}: {lines[current_line + 1]}")
return False
# Verify email line format
if not lines[current_line + 2].strip().startswith("email: "):
print(f"❌ Invalid email line format at line {current_line + 3}: {lines[current_line + 2]}")
return False
student_count += 1
current_line += 3
# Check for blank line separator (except for the last student)
if current_line < len(lines) and lines[current_line].strip():
print(f"❌ Missing blank line separator after student {student_count}")
return False
current_line += 1
if student_count == 0:
print("❌ No valid student entries found")
return False
print(f"✅ File format is correct with {student_count} students")
return True
except Exception as e:
print(f"❌ Error reading qualified students file: {e}")
return False
def parse_qualified_students_file(test_dir: Path) -> list:
"""Parse the qualified_students.txt file and return structured data."""
answer_file = test_dir / "qualified_students.txt"
try:
content = answer_file.read_text()
lines = content.strip().split('\n')
students = []
current_line = 0
while current_line < len(lines):
# Skip blank lines
if not lines[current_line].strip():
current_line += 1
continue
# Parse student entry
name_line = lines[current_line].strip()
id_line = lines[current_line + 1].strip()
email_line = lines[current_line + 2].strip()
# Extract name
name = name_line.replace("name: ", "").strip()
# Extract id
student_id = id_line.replace("id: ", "").strip()
# Extract email
email = email_line.replace("email: ", "").strip()
students.append({
'name': name,
'id': student_id,
'email': email
})
current_line += 4 # Skip to next student (after blank line)
return students
except Exception as e:
print(f"❌ Error parsing qualified students file: {e}")
return []
def verify_student_count(students: list) -> bool:
"""Verify that exactly 19 students are found."""
expected_count = 19
actual_count = len(students)
if actual_count != expected_count:
print(f"❌ Expected {expected_count} students, but found {actual_count}")
return False
print(f"✅ Found exactly {expected_count} students")
return True
def verify_expected_students(students: list) -> bool:
"""Verify that all expected students are present with correct details."""
# Expected students from answer.md
expected_students = {
'James Smith': {'id': '20177389', 'email': 'james.smith30@outlook.com'},
'Ava Lopez': {'id': '20166998', 'email': 'ava.lopez67@outlook.com'},
'James Anderson': {'id': '20153606', 'email': 'james.anderson71@yahoo.com'},
'Benjamin Anderson': {'id': '20136681', 'email': 'benjamin.anderson37@qq.com'},
'Sarah Wilson': {'id': '20158819', 'email': 'sarah.wilson96@outlook.com'},
'Isabella Davis': {'id': '20101701', 'email': 'isabella.davis89@gmail.com'},
'James Moore': {'id': '20188937', 'email': 'james.moore62@gmail.com'},
'Harper Williams': {'id': '20157943', 'email': 'harper.williams38@163.com'},
'Noah Smith': {'id': '20132669', 'email': 'noah.smith45@163.com'},
'Emma Thomas': {'id': '20109144', 'email': 'emma.thomas68@163.com'},
'Mary Brown': {'id': '20199583', 'email': 'mary.brown27@yahoo.com'},
'John Jones': {'id': '20201800', 'email': 'john.jones46@gmail.com'},
'Mia Anderson': {'id': '20162542', 'email': 'mia.anderson3@outlook.com'},
'Barbara Davis': {'id': '20126203', 'email': 'barbara.davis67@163.com'},
'Thomas Brown': {'id': '20119528', 'email': 'thomas.brown43@163.com'},
'Susan Anderson': {'id': '20148778', 'email': 'susan.anderson16@163.com'},
'Mary Garcia': {'id': '20174369', 'email': 'mary.garcia58@gmail.com'},
'Richard Wilson': {'id': '20174207', 'email': 'richard.wilson39@outlook.com'},
'Joseph Lopez': {'id': '20191265', 'email': 'joseph.lopez93@yahoo.com'}
}
# Check if all expected students are present
found_students = set()
for student in students:
found_students.add(student['name'])
missing_students = set(expected_students.keys()) - found_students
if missing_students:
print(f"❌ Missing expected students: {missing_students}")
return False
# Check if all found students are expected
unexpected_students = found_students - set(expected_students.keys())
if unexpected_students:
print(f"❌ Unexpected students found: {unexpected_students}")
return False
# Check if student details match exactly
for student in students:
expected = expected_students[student['name']]
if student['id'] != expected['id']:
print(f"❌ ID mismatch for {student['name']}: expected {expected['id']}, got {student['id']}")
return False
if student['email'] != expected['email']:
print(f"❌ Email mismatch for {student['name']}: expected {expected['email']}, got {student['email']}")
return False
print("✅ All expected students are present with correct details")
return True
def main():
"""Main verification function."""
test_dir = get_test_directory()
print("🔍 Verifying Student Database Task: English Talent Recruitment...")
# Define verification steps
verification_steps = [
("Qualified Students File Exists", verify_qualified_students_file_exists),
("File Format", verify_file_format),
]
# Run basic verification steps first
all_passed = True
for step_name, verify_func in verification_steps:
print(f"\n--- {step_name} ---")
if not verify_func(test_dir):
all_passed = False
break
if not all_passed:
print("\n❌ Basic verification failed, cannot proceed with content verification")
sys.exit(1)
# Parse the file and run content verification
print("\n--- Content Verification ---")
students = parse_qualified_students_file(test_dir)
if not students:
print("❌ Failed to parse qualified students file")
sys.exit(1)
content_verification_steps = [
("Student Count", lambda: verify_student_count(students)),
("Expected Students", lambda: verify_expected_students(students)),
]
for step_name, verify_func in content_verification_steps:
print(f"\n--- {step_name} ---")
if not verify_func():
all_passed = False
# Final result
print("\n" + "="*50)
if all_passed:
print("✅ English talent recruitment completed correctly!")
print(f"🎉 Found exactly {len(students)} qualified students")
print("🎉 Task verification: PASS")
sys.exit(0)
else:
print("❌ Task verification: FAIL")
sys.exit(1)
if __name__ == "__main__":
main()
================================================
FILE: tasks/filesystem/standard/student_database/gradebased_score/description.md
================================================
Please use FileSystem tools to finish the following task:
### Simple Grade Calculation
1. Read Student Data:
* Process all student basic_info.txt files from the database
* Extract scores for Chinese, Math, and English subjects
2. Calculate Basic Grades:
* Use a simple grade scale: A (90+), B (80-89), C (70-79), D (60-69), F (<60); a sketch of this mapping appears at the end of this description
* Apply this same scale to all subjects
### Generate Output Files
1. Create student_grades.csv:
* Columns: student_id, name, chinese_score, chinese_grade, math_score, math_grade, english_score, english_grade
* Must contain an entry for every student in the database
* Exactly one row per student
2. Create grade_summary.txt:
* Total number of students processed
* Number of A's, B's, C's, D's, and F's for each subject
* Simple count of students with passing grades (A, B, C) vs failing grades (D, F) for each subject
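A minimal sketch of the grade mapping described above, for reference only (the function name is illustrative, not required by the task):

```python
def letter_grade(score: float) -> str:
    """Map a numeric score to the simple A-F scale used in this task."""
    if score >= 90:
        return "A"
    if score >= 80:
        return "B"
    if score >= 70:
        return "C"
    if score >= 60:
        return "D"
    return "F"
```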
================================================
FILE: tasks/filesystem/standard/student_database/gradebased_score/meta.json
================================================
{
"task_id": "gradebased_score",
"task_name": "Gradebased Score",
"category_id": "student_database",
"category_name": "Student Database",
"description": "Process student numerical scores to calculate letter grades using A-F scale and produce comprehensive grade distribution analysis reports.",
"author": "Lingjun Chen",
"created_at": "2025-08-10",
"difficulty": "L3",
"tags": [
"data extraction",
"content transformation",
"pattern analysis"
],
"mcp": [
"filesystem"
],
"meta_data": {
"stateType": "text",
"stateContent": "student_database/\n ├── 20101250_Patricia_Jones/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20101701_Isabella_Davis/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20102572_Michael_Taylor/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20104233_Robert_Lopez/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20104498_Sarah_Brown/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20104653_Sophia_Brown/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20104675_Michael_Gonzalez/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20104846_Christopher_Brown/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20107487_Mia_Martin/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20108742_Sarah_Brown/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20109144_Emma_Thomas/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20109803_Oliver_Hernandez/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20111634_Isabella_Thomas/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20112439_Christopher_Moore/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20113368_William_Wilson/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20113603_Robert_Rodriguez/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20114397_Isabella_Martin/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20114869_Ethan_Martin/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20115252_Mason_Johnson/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20115632_Elizabeth_Anderson/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20115753_Charlotte_Johnson/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20115924_Michael_Lopez/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20116232_Olivia_Lopez/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20119528_Thomas_Brown/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20122427_Karen_Gonzalez/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20122977_Evelyn_Miller/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20123376_Joseph_Johnson/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20125451_Barbara_Brown/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20126203_Barbara_Davis/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20126394_Olivia_Williams/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20126471_Ethan_Taylor/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20127423_John_Williams/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20128249_Oliver_Smith/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20128879_Christopher_Taylor/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20129898_Jessica_Johnson/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20131271_Olivia_Brown/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20131518_Sophia_Smith/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20132026_Isabella_Smith/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20132370_James_Brown/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20132669_Noah_Smith/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 
20133527_Mason_Jackson/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20133697_Isabella_Smith/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20135821_Thomas_Wilson/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20136681_Benjamin_Anderson/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20136890_Benjamin_Brown/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20137514_Lucas_Anderson/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20139234_Harper_Martinez/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20139637_Noah_Johnson/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20139647_Patricia_Lopez/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20141421_Linda_Gonzalez/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20142085_William_Anderson/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20142383_Amelia_Brown/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20143406_Susan_Martin/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20143830_James_Garcia/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20146035_Christopher_Garcia/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20146277_William_Anderson/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20146279_Christopher_Moore/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20147301_James_Jones/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20147789_James_Anderson/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20148681_John_Hernandez/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20148778_Susan_Anderson/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20149712_Jessica_Rodriguez/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20151012_Harper_Miller/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20153174_Benjamin_Jackson/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20153412_Charlotte_Martin/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20153606_James_Anderson/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20153687_Richard_Taylor/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20154518_John_Gonzalez/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20154710_Benjamin_Rodriguez/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20156469_Jennifer_Hernandez/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20156522_Jennifer_Martinez/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20156851_Noah_Anderson/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20157943_Harper_Williams/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20158266_Sophia_Moore/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20158294_Sophia_Wilson/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20158819_Sarah_Wilson/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20159113_John_Rodriguez/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20159695_James_Moore/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20161279_William_Moore/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20162253_Mason_Rodriguez/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20162542_Mia_Anderson/\n 
│ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20163356_Ava_Anderson/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20164515_Patricia_Moore/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20164801_Noah_Rodriguez/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20165511_Mary_Gonzalez/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20166436_Christopher_Jackson/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20166487_Barbara_Hernandez/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20166564_Ava_Lopez/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20166998_Ava_Lopez/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20168311_Lucas_Jackson/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20168491_Karen_Martinez/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20169515_Thomas_Taylor/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20171050_Christopher_Rodriguez/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20171406_Mary_Anderson/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20171613_Ethan_Moore/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20172106_Isabella_Rodriguez/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20173259_Michael_Anderson/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20173492_Richard_Miller/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20173501_Mary_Smith/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20173517_Susan_Anderson/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20174207_Richard_Wilson/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20174369_Mary_Garcia/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20175314_William_Taylor/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20176169_Lucas_Smith/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20176947_Noah_Miller/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20177389_James_Smith/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20178687_Isabella_Anderson/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20179461_William_Johnson/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20179690_Linda_Thomas/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20181056_Sarah_Hernandez/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20182020_Patricia_Taylor/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20182390_Ethan_Wilson/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20183149_David_Smith/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20183219_Charlotte_Williams/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20184489_Jessica_Gonzalez/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20186154_Charlotte_Smith/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20186510_James_Thomas/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20187107_David_Martinez/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20187144_Mary_Jackson/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20187892_Christopher_Taylor/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20187921_Mary_Jones/\n │ ├── basic_info.txt\n │ └── 
recommendation_letter.txt\n ├── 20187967_Sarah_Davis/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20188937_James_Moore/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20189123_Mary_Martin/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20189192_Olivia_Jones/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20189268_Emma_Williams/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20189854_William_Taylor/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20191265_Joseph_Lopez/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20192725_Robert_Martinez/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20194054_Michael_Jones/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20194160_Benjamin_Jackson/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20194164_Sarah_Jones/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20194525_John_Taylor/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20195164_Jennifer_Gonzalez/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20195982_David_Jackson/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20196776_William_Brown/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20196896_Olivia_Jones/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20196961_Joseph_Thomas/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20196998_Ethan_Wilson/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20198548_Evelyn_Moore/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20199036_Benjamin_Hernandez/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20199583_Mary_Brown/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20199735_Mason_Johnson/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20199872_Sophia_Jackson/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20199980_James_Rodriguez/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20201385_John_Taylor/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20201800_John_Jones/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20202548_Robert_Miller/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n ├── 20203855_Mia_Miller/\n │ ├── basic_info.txt\n │ └── recommendation_letter.txt\n └── 20204611_Sarah_Wilson/\n ├── basic_info.txt\n └── recommendation_letter.txt",
"stateUrl": "https://storage.mcpmark.ai/filesystem/student_database.zip",
"stateOriginalUrl": null
}
}
================================================
FILE: tasks/filesystem/standard/student_database/gradebased_score/verify.py
================================================
#!/usr/bin/env python3
"""
Verification script for Student Database Grade-Based Score Analysis Task
"""
import sys
from pathlib import Path
import os
import re
def get_test_directory() -> Path:
"""Get the test directory from FILESYSTEM_TEST_DIR env var."""
test_root = os.environ.get("FILESYSTEM_TEST_DIR")
if not test_root:
raise ValueError("FILESYSTEM_TEST_DIR environment variable is required")
return Path(test_root)
def verify_grade_summary_exists(test_dir: Path) -> bool:
"""Verify that grade_summary.txt file exists."""
grade_summary_file = test_dir / "grade_summary.txt"
if not grade_summary_file.exists():
print("❌ File 'grade_summary.txt' not found")
return False
print("✅ grade_summary.txt file found")
return True
def verify_grade_summary_readable(test_dir: Path) -> bool:
"""Verify that the grade_summary.txt file is readable."""
grade_summary_file = test_dir / "grade_summary.txt"
try:
content = grade_summary_file.read_text()
if not content.strip():
print("❌ grade_summary.txt file is empty")
return False
print("✅ grade_summary.txt file is readable")
return True
except Exception as e:
print(f"❌ Error reading grade_summary.txt file: {e}")
return False
def extract_numbers_from_text(text: str) -> list:
"""Extract all numbers from text."""
numbers = re.findall(r'\d+', text)
return [int(num) for num in numbers]
def verify_three_subjects_present(test_dir: Path) -> bool:
"""Verify that grade_summary.txt contains all three subjects (case insensitive)."""
grade_summary_file = test_dir / "grade_summary.txt"
try:
content = grade_summary_file.read_text()
# Check if all three subjects are mentioned (case insensitive)
subjects = ["chinese", "math", "english"]
missing_subjects = []
for subject in subjects:
if subject.lower() not in content.lower():
missing_subjects.append(subject)
if missing_subjects:
print(f"❌ Missing subjects in grade_summary.txt: {missing_subjects}")
return False
print("✅ All three subjects (Chinese, Math, English) found in grade_summary.txt")
return True
except Exception as e:
print(f"❌ Error checking subjects: {e}")
return False
def verify_grade_summary_content(test_dir: Path) -> bool:
"""Verify that grade_summary.txt contains the correct statistics from answer.md."""
grade_summary_file = test_dir / "grade_summary.txt"
try:
content = grade_summary_file.read_text()
# Extract all numbers from the content
found_numbers = extract_numbers_from_text(content)
if not found_numbers:
print("❌ No numbers found in grade_summary.txt")
return False
# Expected numbers from answer.md
# Format: [total_students, chinese_A, chinese_B, chinese_C, chinese_D, chinese_pass, chinese_fail,
# math_A, math_B, math_C, math_D, math_pass, math_fail,
# english_A, english_B, english_C, english_D, english_F, english_pass, english_fail]
expected_numbers = [
# Total students
150,
# Chinese grades: A(42), B(37), C(43), D(28), Pass(122), Fail(28)
42, 37, 43, 28, 122, 28,
# Math grades: A(31), B(38), C(47), D(34), Pass(116), Fail(34)
31, 38, 47, 34, 116, 34,
# English grades: A(32), B(38), C(38), D(41), F(1), Pass(108), Fail(42)
32, 38, 38, 41, 1, 108, 42
]
# Check if all expected numbers are present in the found numbers
missing_numbers = []
for expected in expected_numbers:
if expected not in found_numbers:
missing_numbers.append(expected)
if missing_numbers:
print(f"❌ Missing expected numbers: {missing_numbers}")
print(f" Found numbers: {found_numbers}")
return False
# Check if the counts match (each number should appear the expected number of times)
for expected in expected_numbers:
expected_count = expected_numbers.count(expected)
found_count = found_numbers.count(expected)
if found_count < expected_count:
print(f"❌ Number {expected} appears {found_count} times, expected {expected_count} times")
return False
print("✅ All expected grade statistics found in grade_summary.txt")
return True
except Exception as e:
print(f"❌ Error verifying grade summary content: {e}")
return False
def main():
"""Main verification function."""
try:
test_dir = get_test_directory()
print(f"🔍 Verifying Student Database Grade-Based Score Analysis in: {test_dir}")
# Define verification steps
verification_steps = [
("Grade Summary File Exists", verify_grade_summary_exists),
("File is Readable", verify_grade_summary_readable),
("Three Subjects Present", verify_three_subjects_present),
("Grade Statistics Content", verify_grade_summary_content),
]
# Run all verification steps
all_passed = True
for step_name, verify_func in verification_steps:
print(f"\n--- {step_name} ---")
if not verify_func(test_dir):
all_passed = False
# Final result
print("\n" + "="*50)
if all_passed:
print("✅ Student grade analysis completed correctly!")
print("🎉 Grade-Based Score Analysis verification: PASS")
sys.exit(0)
else:
print("❌ Grade-Based Score Analysis verification: FAIL")
sys.exit(1)
except Exception as e:
print(f"❌ Verification failed with error: {e}")
sys.exit(1)
if __name__ == "__main__":
main()
================================================
FILE: tasks/filesystem/standard/threestudio/code_locating/description.md
================================================
Please use FileSystem tools to finish the following task:
### Task Description
ThreeStudio is a comprehensive codebase that implements various diffusion-based text-to-3D models, including a NeRF-based rendering stage and a diffusion guidance stage. Your task is to explore the codebase and identify the specific file that defines the guidance functionality for the Zero123 model.
### Task Objectives
1. **Explore the ThreeStudio codebase** using filesystem MCP tools
2. **Search through the project structure** to understand the codebase organization
3. **Identify the file** that contains the Zero123 guidance implementation
4. **Create an answer file** with the correct file path
### Expected Output
Create a file named `answer.txt` in the test directory root
**Requirements:**
- Only include the file path, no additional text or explanation
- Use forward slashes (/) for path separators
- Include the full relative path from the project root
- Ensure the path points to the actual file that defines Zero123 guidance
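As a rough illustration of how such an answer file could be produced (not the intended solution, which is meant to use the filesystem MCP tools), a minimal name-based search sketch follows; the search heuristic, the shortest-name preference, and the use of the `FILESYSTEM_TEST_DIR` environment variable from the verifier are assumptions for illustration only:

```python
#!/usr/bin/env python3
"""Minimal illustrative sketch (not part of the task): locate a likely
Zero123 guidance file and write its relative path to answer.txt.
The name-based heuristic and shortest-name preference are assumptions."""
import os
from pathlib import Path

test_dir = Path(os.environ["FILESYSTEM_TEST_DIR"])

# Candidate files whose names mention both "zero123" and "guidance".
candidates = [
    p for p in test_dir.rglob("*.py")
    if "zero123" in p.name and "guidance" in p.name
]

if candidates:
    # Prefer zero123_guidance.py over the stable_* / *_unified_* variants.
    candidates.sort(key=lambda p: len(p.name))
    answer = candidates[0].relative_to(test_dir).as_posix()
    (test_dir / "answer.txt").write_text(answer)
```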
================================================
FILE: tasks/filesystem/standard/threestudio/code_locating/meta.json
================================================
{
"task_id": "code_locating",
"task_name": "Code Locating",
"category_id": "threestudio",
"category_name": "Threestudio",
"description": "Navigate the ThreeStudio codebase to locate and identify the specific file that defines Zero123 guidance functionality implementation.",
"author": "Lingjun Chen",
"created_at": "2025-08-05",
"difficulty": "L3",
"tags": [
"code exploration"
],
"mcp": [
"filesystem"
],
"meta_data": {
"stateType": "text",
"stateContent": "threestudio/\n ├── configs/\n │ ├── debugging/\n │ │ ├── controlnet-canny.yaml\n │ │ ├── controlnet-normal.yaml\n │ │ ├── instructpix2pix.yaml\n │ │ └── stablediffusion.yaml\n │ ├── experimental/\n │ │ ├── unified-guidance/\n │ │ │ ├── dreamfusion-sd.yaml\n │ │ │ ├── hifa.yaml\n │ │ │ ├── prolificdreamer-hifa.yaml\n │ │ │ ├── prolificdreamer.yaml\n │ │ │ └── zero123-simple.yaml\n │ │ ├── co3d-imagecondition.yaml\n │ │ ├── imagecondition.yaml\n │ │ ├── imagecondition_zero123nerf.yaml\n │ │ ├── imagecondition_zero123nerf_refine.yaml\n │ │ ├── prolificdreamer-importance.yaml\n │ │ ├── prolificdreamer-neus-importance.yaml\n │ │ ├── prolificdreamer-propnet.yaml\n │ │ └── textmesh-if-importance.yaml\n │ ├── gradio/\n │ │ ├── dreamfusion-if.yaml\n │ │ ├── dreamfusion-sd.yaml\n │ │ ├── fantasia3d.yaml\n │ │ ├── latentnerf.yaml\n │ │ ├── sjc.yaml\n │ │ └── textmesh-if.yaml\n │ ├── control4d-static.yaml\n │ ├── dreamfusion-if.yaml\n │ ├── dreamfusion-sd-eff.yaml\n │ ├── dreamfusion-sd.yaml\n │ ├── fantasia3d-texture.yaml\n │ ├── fantasia3d.yaml\n │ ├── hifa.yaml\n │ ├── instructnerf2nerf.yaml\n │ ├── latentnerf-refine.yaml\n │ ├── latentnerf.yaml\n │ ├── magic123-coarse-sd.yaml\n │ ├── magic123-hifa-coarse-sd.yaml\n │ ├── magic123-hifa-refine-sd.yaml\n │ ├── magic123-refine-sd.yaml\n │ ├── magic3d-coarse-if.yaml\n │ ├── magic3d-coarse-sd.yaml\n │ ├── magic3d-refine-sd.yaml\n │ ├── prolificdreamer-geometry.yaml\n │ ├── prolificdreamer-hifa.yaml\n │ ├── prolificdreamer-patch.yaml\n │ ├── prolificdreamer-scene-hifa.yaml\n │ ├── prolificdreamer-scene.yaml\n │ ├── prolificdreamer-texture.yaml\n │ ├── prolificdreamer.yaml\n │ ├── sdi.yaml\n │ ├── sjc.yaml\n │ ├── sketchshape-refine.yaml\n │ ├── sketchshape.yaml\n │ ├── stable-zero123.yaml\n │ ├── textmesh-if.yaml\n │ ├── zero123-geometry.yaml\n │ └── zero123.yaml\n ├── custom/\n │ └── put_custom_extensions_here\n ├── docker/\n │ ├── compose.yaml\n │ └── Dockerfile\n ├── docs/\n │ └── installation.md\n ├── extern/\n │ ├── ldm_zero123/\n │ │ ├── models/\n │ │ │ ├── diffusion/\n │ │ │ │ ├── __init__.py\n │ │ │ │ ├── classifier.py\n │ │ │ │ ├── ddim.py\n │ │ │ │ ├── ddpm.py\n │ │ │ │ ├── plms.py\n │ │ │ │ └── sampling_util.py\n │ │ │ └── autoencoder.py\n │ │ ├── modules/\n │ │ │ ├── diffusionmodules/\n │ │ │ │ ├── __init__.py\n │ │ │ │ ├── model.py\n │ │ │ │ ├── openaimodel.py\n │ │ │ │ └── util.py\n │ │ │ ├── distributions/\n │ │ │ │ ├── __init__.py\n │ │ │ │ └── distributions.py\n │ │ │ ├── encoders/\n │ │ │ │ ├── __init__.py\n │ │ │ │ └── modules.py\n │ │ │ ├── evaluate/\n │ │ │ │ ├── adm_evaluator.py\n │ │ │ │ ├── evaluate_perceptualsim.py\n │ │ │ │ ├── frechet_video_distance.py\n │ │ │ │ ├── ssim.py\n │ │ │ │ └── torch_frechet_video_distance.py\n │ │ │ ├── image_degradation/\n │ │ │ │ ├── utils/\n │ │ │ │ │ └── test.png\n │ │ │ │ ├── __init__.py\n │ │ │ │ ├── bsrgan.py\n │ │ │ │ ├── bsrgan_light.py\n │ │ │ │ └── utils_image.py\n │ │ │ ├── losses/\n │ │ │ │ ├── __init__.py\n │ │ │ │ ├── contperceptual.py\n │ │ │ │ └── vqperceptual.py\n │ │ │ ├── attention.py\n │ │ │ ├── ema.py\n │ │ │ └── x_transformer.py\n │ │ ├── thirdp/\n │ │ │ └── psp/\n │ │ │ ├── helpers.py\n │ │ │ ├── id_loss.py\n │ │ │ └── model_irse.py\n │ │ ├── __init__.py\n │ │ ├── extras.py\n │ │ ├── guidance.py\n │ │ ├── lr_scheduler.py\n │ │ └── util.py\n │ ├── __init__.py\n │ └── zero123.py\n ├── load/\n │ ├── images/\n │ │ ├── anya_front.png\n │ │ ├── anya_front_depth.png\n │ │ ├── anya_front_normal.png\n │ │ ├── anya_front_rgba.png\n │ │ ├── baby_phoenix_on_ice.png\n │ │ 
├── baby_phoenix_on_ice_depth.png\n │ │ ├── baby_phoenix_on_ice_normal.png\n │ │ ├── baby_phoenix_on_ice_rgba.png\n │ │ ├── beach_house_1.png\n │ │ ├── beach_house_1_depth.png\n │ │ ├── beach_house_1_normal.png\n │ │ ├── beach_house_1_rgba.png\n │ │ ├── beach_house_2.png\n │ │ ├── beach_house_2_depth.png\n │ │ ├── beach_house_2_normal.png\n │ │ ├── beach_house_2_rgba.png\n │ │ ├── bollywood_actress.png\n │ │ ├── bollywood_actress_depth.png\n │ │ ├── bollywood_actress_normal.png\n │ │ ├── bollywood_actress_rgba.png\n │ │ ├── cactus.png\n │ │ ├── cactus_depth.png\n │ │ ├── cactus_normal.png\n │ │ ├── cactus_rgba.png\n │ │ ├── catstatue.png\n │ │ ├── catstatue_depth.png\n │ │ ├── catstatue_normal.png\n │ │ ├── catstatue_rgba.png\n │ │ ├── church_ruins.png\n │ │ ├── church_ruins_depth.png\n │ │ ├── church_ruins_normal.png\n │ │ ├── church_ruins_rgba.png\n │ │ ├── dog1_rgba.png\n │ │ ├── dragon2_rgba.png\n │ │ ├── firekeeper.jpg\n │ │ ├── firekeeper_depth.png\n │ │ ├── firekeeper_normal.png\n │ │ ├── firekeeper_rgba.png\n │ │ ├── futuristic_car.png\n │ │ ├── futuristic_car_depth.png\n │ │ ├── futuristic_car_normal.png\n │ │ ├── futuristic_car_rgba.png\n │ │ ├── grootplant_rgba.png\n │ │ ├── hamburger.png\n │ │ ├── hamburger_depth.png\n │ │ ├── hamburger_rgba.png\n │ │ ├── mona_lisa.png\n │ │ ├── mona_lisa_depth.png\n │ │ ├── mona_lisa_normal.png\n │ │ ├── mona_lisa_rgba.png\n │ │ ├── robot_rgba.png\n │ │ ├── teddy.png\n │ │ ├── teddy_depth.png\n │ │ ├── teddy_normal.png\n │ │ ├── teddy_rgba.png\n │ │ └── thorhammer_rgba.png\n │ ├── lights/\n │ │ ├── bsdf_256_256.bin\n │ │ ├── LICENSE.txt\n │ │ └── mud_road_puresky_1k.hdr\n │ ├── shapes/\n │ │ ├── animal.obj\n │ │ ├── blub.obj\n │ │ ├── cabin.obj\n │ │ ├── env_sphere.obj\n │ │ ├── hand_prismatic.obj\n │ │ ├── human.obj\n │ │ ├── nascar.obj\n │ │ ├── potion.obj\n │ │ ├── README.md\n │ │ └── teddy.obj\n │ ├── tets/\n │ │ ├── 128_tets.npz\n │ │ ├── 32_tets.npz\n │ │ ├── 64_tets.npz\n │ │ └── generate_tets.py\n │ ├── zero123/\n │ │ ├── download.sh\n │ │ └── sd-objaverse-finetune-c_concat-256.yaml\n │ ├── make_prompt_library.py\n │ └── prompt_library.json\n ├── scripts/\n │ └── convert_zero123_to_diffusers.py\n ├── threestudio/\n │ ├── data/\n │ │ ├── __init__.py\n │ │ ├── co3d.py\n │ │ ├── image.py\n │ │ ├── multiview.py\n │ │ ├── uncond.py\n │ │ └── uncond_eff.py\n │ ├── models/\n │ │ ├── background/\n │ │ │ ├── __init__.py\n │ │ │ ├── base.py\n │ │ │ ├── neural_environment_map_background.py\n │ │ │ ├── solid_color_background.py\n │ │ │ └── textured_background.py\n │ │ ├── exporters/\n │ │ │ ├── __init__.py\n │ │ │ ├── base.py\n │ │ │ └── mesh_exporter.py\n │ │ ├── geometry/\n │ │ │ ├── __init__.py\n │ │ │ ├── base.py\n │ │ │ ├── custom_mesh.py\n │ │ │ ├── implicit_sdf.py\n │ │ │ ├── implicit_volume.py\n │ │ │ ├── tetrahedra_sdf_grid.py\n │ │ │ └── volume_grid.py\n │ │ ├── guidance/\n │ │ │ ├── __init__.py\n │ │ │ ├── controlnet_guidance.py\n │ │ │ ├── deep_floyd_guidance.py\n │ │ │ ├── instructpix2pix_guidance.py\n │ │ │ ├── stable_diffusion_guidance.py\n │ │ │ ├── stable_diffusion_sdi_guidance.py\n │ │ │ ├── stable_diffusion_unified_guidance.py\n │ │ │ ├── stable_diffusion_vsd_guidance.py\n │ │ │ ├── stable_zero123_guidance.py\n │ │ │ ├── zero123_guidance.py\n │ │ │ └── zero123_unified_guidance.py\n │ │ ├── materials/\n │ │ │ ├── __init__.py\n │ │ │ ├── base.py\n │ │ │ ├── diffuse_with_point_light_material.py\n │ │ │ ├── hybrid_rgb_latent_material.py\n │ │ │ ├── neural_radiance_material.py\n │ │ │ ├── no_material.py\n │ │ │ ├── pbr_material.py\n 
│ │ │ └── sd_latent_adapter_material.py\n │ │ ├── prompt_processors/\n │ │ │ ├── __init__.py\n │ │ │ ├── base.py\n │ │ │ ├── deepfloyd_prompt_processor.py\n │ │ │ ├── dummy_prompt_processor.py\n │ │ │ └── stable_diffusion_prompt_processor.py\n │ │ ├── renderers/\n │ │ │ ├── __init__.py\n │ │ │ ├── base.py\n │ │ │ ├── deferred_volume_renderer.py\n │ │ │ ├── gan_volume_renderer.py\n │ │ │ ├── nerf_volume_renderer.py\n │ │ │ ├── neus_volume_renderer.py\n │ │ │ ├── nvdiff_rasterizer.py\n │ │ │ └── patch_renderer.py\n │ │ ├── __init__.py\n │ │ ├── estimators.py\n │ │ ├── isosurface.py\n │ │ ├── mesh.py\n │ │ └── networks.py\n │ ├── scripts/\n │ │ ├── make_training_vid.py\n │ │ ├── run_zero123.sh\n │ │ ├── run_zero123_comparison.sh\n │ │ ├── run_zero123_phase.sh\n │ │ ├── run_zero123_phase2.sh\n │ │ ├── run_zero123_sbatch.py\n │ │ ├── zero123_demo.py\n │ │ └── zero123_sbatch.sh\n │ ├── systems/\n │ │ ├── __init__.py\n │ │ ├── base.py\n │ │ ├── control4d_multiview.py\n │ │ ├── dreamfusion.py\n │ │ ├── eff_dreamfusion.py\n │ │ ├── fantasia3d.py\n │ │ ├── imagedreamfusion.py\n │ │ ├── instructnerf2nerf.py\n │ │ ├── latentnerf.py\n │ │ ├── magic123.py\n │ │ ├── magic3d.py\n │ │ ├── optimizers.py\n │ │ ├── prolificdreamer.py\n │ │ ├── sdi.py\n │ │ ├── sjc.py\n │ │ ├── textmesh.py\n │ │ ├── utils.py\n │ │ ├── zero123.py\n │ │ └── zero123_simple.py\n │ ├── utils/\n │ │ ├── GAN/\n │ │ │ ├── __init__.py\n │ │ │ ├── attention.py\n │ │ │ ├── discriminator.py\n │ │ │ ├── distribution.py\n │ │ │ ├── loss.py\n │ │ │ ├── mobilenet.py\n │ │ │ ├── network_util.py\n │ │ │ ├── util.py\n │ │ │ └── vae.py\n │ │ ├── perceptual/\n │ │ │ ├── __init__.py\n │ │ │ ├── perceptual.py\n │ │ │ └── utils.py\n │ │ ├── __init__.py\n │ │ ├── base.py\n │ │ ├── callbacks.py\n │ │ ├── config.py\n │ │ ├── loss.py\n │ │ ├── misc.py\n │ │ ├── ops.py\n │ │ ├── rasterize.py\n │ │ ├── saving.py\n │ │ └── typing.py\n │ └── __init__.py\n ├── .editorconfig\n ├── .pre-commit-config.yaml\n ├── .pylintrc\n ├── 2dplayground.ipynb\n ├── 2dplayground_SDI_version.ipynb\n ├── CHANGELOG.md\n ├── DOCUMENTATION.md\n ├── gradio_app.py\n ├── launch.py\n ├── LICENSE\n ├── README.md\n ├── requirements-dev.txt\n ├── requirements.txt\n ├── setup.py\n └── threestudio.ipynb",
"stateUrl": "https://storage.mcpmark.ai/filesystem/threestudio.zip",
"stateOriginalUrl": "https://github.com/threestudio-project/threestudio"
}
}
================================================
FILE: tasks/filesystem/standard/threestudio/code_locating/verify.py
================================================
#!/usr/bin/env python3
"""
Verification script for ThreeStudio Task 1: Find Zero123 Guidance Implementation
"""
import sys
from pathlib import Path
import re
import os
def get_test_directory() -> Path:
"""Get the test directory from FILESYSTEM_TEST_DIR env var."""
test_root = os.environ.get("FILESYSTEM_TEST_DIR")
if not test_root:
raise ValueError("FILESYSTEM_TEST_DIR environment variable is required")
return Path(test_root)
def verify_answer_file_exists(test_dir: Path) -> bool:
"""Verify that the answer.txt file exists."""
answer_file = test_dir / "answer.txt"
if not answer_file.exists():
print("❌ File 'answer.txt' not found")
return False
print("✅ Answer file found")
return True
def verify_answer_format(test_dir: Path) -> bool:
"""Verify that the answer file has the correct format."""
answer_file = test_dir / "answer.txt"
try:
content = answer_file.read_text().strip()
# Check if content is not empty
if not content:
print("❌ Answer file is empty")
return False
# Check if it contains only the file path (no additional text)
if len(content.split('\n')) > 1:
print("❌ Answer file contains multiple lines or additional text")
return False
# Check if it uses forward slashes
if '\\' in content:
print("❌ Answer uses backslashes instead of forward slashes")
return False
# Check if it's a relative path
if content.startswith('/') or ':' in content:
print("❌ Answer appears to be an absolute path")
return False
print("✅ Answer format is correct")
return True
except Exception as e:
print(f"❌ Error reading answer file: {e}")
return False
def verify_file_path_structure(test_dir: Path) -> bool:
"""Verify that the file path has the expected structure."""
answer_file = test_dir / "answer.txt"
try:
content = answer_file.read_text().strip()
# Expected path components for Zero123 guidance
# In backup directories, the path is threestudio/models/guidance/zero123_guidance.py
# In test_environments, the path is threestudio/threestudio/models/guidance/zero123_guidance.py
expected_components = ["threestudio", "models", "guidance", "zero123_guidance.py"]
# Check if all expected components are in the path
for component in expected_components:
if component not in content:
print(f"❌ Path missing expected component: {component}")
return False
print("✅ File path structure is correct")
return True
except Exception as e:
print(f"❌ Error verifying file path structure: {e}")
return False
def verify_file_exists(test_dir: Path) -> bool:
"""Verify that the identified file actually exists."""
answer_file = test_dir / "answer.txt"
try:
content = answer_file.read_text().strip()
# Try the path as provided in the answer file
file_path = test_dir / content
# If that doesn't exist, try with the correct path structure
# The answer file might have threestudio/models/guidance/zero123_guidance.py
# but the actual path is threestudio/threestudio/models/guidance/zero123_guidance.py
if not file_path.exists():
# Try to fix the path by adding the missing threestudio prefix
if content.startswith("threestudio/models/"):
corrected_path = content.replace("threestudio/models/", "threestudio/threestudio/models/")
file_path = test_dir / corrected_path
if file_path.exists():
print(f"✅ File exists with corrected path: {corrected_path}")
return True
if not file_path.exists():
print(f"❌ Identified file does not exist: {content}")
return False
print("✅ Identified file exists")
return True
except Exception as e:
print(f"❌ Error verifying file existence: {e}")
return False
def verify_zero123_guidance_content(test_dir: Path) -> bool:
"""Verify that the identified file actually contains Zero123 guidance implementation."""
answer_file = test_dir / "answer.txt"
try:
content = answer_file.read_text().strip()
# Try the path as provided in the answer file
file_path = test_dir / content
# If that doesn't exist, try with the correct path structure
if not file_path.exists():
# Try to fix the path by adding the missing threestudio prefix
if content.startswith("threestudio/models/"):
corrected_path = content.replace("threestudio/models/", "threestudio/threestudio/models/")
file_path = test_dir / corrected_path
if not file_path.exists():
print(f"❌ Cannot find file for content verification: {content}")
return False
file_content = file_path.read_text()
# Check for the main Zero123 guidance implementation
# The main implementation should have the class name "Zero123Guidance" and register as "zero123-guidance"
main_zero123_indicators = [
r'class Zero123Guidance', # Main class name
r'@threestudio\.register\("zero123-guidance"\)', # Correct registration
r'BaseObject', # Base class
r'zero123', # General zero123 reference
]
found_indicators = []
for indicator in main_zero123_indicators:
if re.search(indicator, file_content, re.IGNORECASE):
found_indicators.append(indicator)
# Check if this is the main Zero123 guidance implementation
is_main_implementation = (
'class Zero123Guidance' in file_content and
'@threestudio.register("zero123-guidance")' in file_content
)
if not is_main_implementation:
print(f"❌ File is not the main Zero123 guidance implementation")
print(f" Expected: class Zero123Guidance and @threestudio.register('zero123-guidance')")
return False
print(f"✅ File contains main Zero123 guidance implementation indicators: {found_indicators}")
return True
except Exception as e:
print(f"❌ Error verifying file content: {e}")
return False
def main():
"""Main verification function."""
test_dir = get_test_directory()
print("🔍 Verifying ThreeStudio Task 1: Find Zero123 Guidance Implementation...")
# Define verification steps
verification_steps = [
("Answer File Exists", verify_answer_file_exists),
("Answer Format", verify_answer_format),
("File Path Structure", verify_file_path_structure),
("File Exists", verify_file_exists),
("Zero123 Guidance Content", verify_zero123_guidance_content),
]
# Run all verification steps
all_passed = True
for step_name, verify_func in verification_steps:
print(f"\n--- {step_name} ---")
if not verify_func(test_dir):
all_passed = False
# Final result
print("\n" + "="*50)
if all_passed:
print("✅ Zero123 guidance file path identified correctly!")
print("🎉 Task 1 verification: PASS")
sys.exit(0)
else:
print("❌ Task 1 verification: FAIL")
sys.exit(1)
if __name__ == "__main__":
main()
================================================
FILE: tasks/filesystem/standard/threestudio/output_analysis/description.md
================================================
Please use FileSystem tools to finish the following task:
### Task Description
ThreeStudio is a comprehensive codebase that implements various diffusion-based text-to-3D models, including a NeRF-based rendering stage and a diffusion guidance stage. Your task is to explore the codebase and identify the specific file that defines the guidance functionality for the Zero123 model.
### Task
What is the output of `guidance_out`, returned by the code at line 137 in `threestudio/systems/zero123.py`?
Clearly state its structure and where you found the answer (file and line numbers). Write your answer in a file named `answer.txt` in the test directory root. Do not add extra explanation or formatting beyond what is required by the task.
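A minimal sketch of how one might inspect the call site before answering is shown below; the doubled `threestudio/threestudio/...` prefix mirrors how the verifier expects the archive to unpack inside the test directory and is an assumption about the unpacked layout:

```python
#!/usr/bin/env python3
"""Minimal illustrative sketch (not part of the task): print line 137 of
threestudio/systems/zero123.py with some context, as a starting point for
tracing what guidance_out contains. The path prefix is an assumption about
how the archive unpacks inside the test directory."""
import os
from pathlib import Path

test_dir = Path(os.environ["FILESYSTEM_TEST_DIR"])
target = test_dir / "threestudio" / "threestudio" / "systems" / "zero123.py"

lines = target.read_text().splitlines()
for idx in range(133, 140):  # 0-based indices; line 137 is lines[136]
    print(f"{idx + 1:4d}: {lines[idx]}")
```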
================================================
FILE: tasks/filesystem/standard/threestudio/output_analysis/meta.json
================================================
{
"task_id": "output_analysis",
"task_name": "Output Analysis",
"category_id": "threestudio",
"category_name": "Threestudio",
"description": "Analyze the structure and components of guidance_out object returned by Zero123 guidance code at line 137 for understanding output format.",
"author": "Lingjun Chen",
"created_at": "2025-08-05",
"difficulty": "L3",
"tags": [
"code exploration",
"pattern analysis"
],
"mcp": [
"filesystem"
],
"meta_data": {
"stateType": "text",
"stateContent": "threestudio/\n ├── configs/\n │ ├── debugging/\n │ │ ├── controlnet-canny.yaml\n │ │ ├── controlnet-normal.yaml\n │ │ ├── instructpix2pix.yaml\n │ │ └── stablediffusion.yaml\n │ ├── experimental/\n │ │ ├── unified-guidance/\n │ │ │ ├── dreamfusion-sd.yaml\n │ │ │ ├── hifa.yaml\n │ │ │ ├── prolificdreamer-hifa.yaml\n │ │ │ ├── prolificdreamer.yaml\n │ │ │ └── zero123-simple.yaml\n │ │ ├── co3d-imagecondition.yaml\n │ │ ├── imagecondition.yaml\n │ │ ├── imagecondition_zero123nerf.yaml\n │ │ ├── imagecondition_zero123nerf_refine.yaml\n │ │ ├── prolificdreamer-importance.yaml\n │ │ ├── prolificdreamer-neus-importance.yaml\n │ │ ├── prolificdreamer-propnet.yaml\n │ │ └── textmesh-if-importance.yaml\n │ ├── gradio/\n │ │ ├── dreamfusion-if.yaml\n │ │ ├── dreamfusion-sd.yaml\n │ │ ├── fantasia3d.yaml\n │ │ ├── latentnerf.yaml\n │ │ ├── sjc.yaml\n │ │ └── textmesh-if.yaml\n │ ├── control4d-static.yaml\n │ ├── dreamfusion-if.yaml\n │ ├── dreamfusion-sd-eff.yaml\n │ ├── dreamfusion-sd.yaml\n │ ├── fantasia3d-texture.yaml\n │ ├── fantasia3d.yaml\n │ ├── hifa.yaml\n │ ├── instructnerf2nerf.yaml\n │ ├── latentnerf-refine.yaml\n │ ├── latentnerf.yaml\n │ ├── magic123-coarse-sd.yaml\n │ ├── magic123-hifa-coarse-sd.yaml\n │ ├── magic123-hifa-refine-sd.yaml\n │ ├── magic123-refine-sd.yaml\n │ ├── magic3d-coarse-if.yaml\n │ ├── magic3d-coarse-sd.yaml\n │ ├── magic3d-refine-sd.yaml\n │ ├── prolificdreamer-geometry.yaml\n │ ├── prolificdreamer-hifa.yaml\n │ ├── prolificdreamer-patch.yaml\n │ ├── prolificdreamer-scene-hifa.yaml\n │ ├── prolificdreamer-scene.yaml\n │ ├── prolificdreamer-texture.yaml\n │ ├── prolificdreamer.yaml\n │ ├── sdi.yaml\n │ ├── sjc.yaml\n │ ├── sketchshape-refine.yaml\n │ ├── sketchshape.yaml\n │ ├── stable-zero123.yaml\n │ ├── textmesh-if.yaml\n │ ├── zero123-geometry.yaml\n │ └── zero123.yaml\n ├── custom/\n │ └── put_custom_extensions_here\n ├── docker/\n │ ├── compose.yaml\n │ └── Dockerfile\n ├── docs/\n │ └── installation.md\n ├── extern/\n │ ├── ldm_zero123/\n │ │ ├── models/\n │ │ │ ├── diffusion/\n │ │ │ │ ├── __init__.py\n │ │ │ │ ├── classifier.py\n │ │ │ │ ├── ddim.py\n │ │ │ │ ├── ddpm.py\n │ │ │ │ ├── plms.py\n │ │ │ │ └── sampling_util.py\n │ │ │ └── autoencoder.py\n │ │ ├── modules/\n │ │ │ ├── diffusionmodules/\n │ │ │ │ ├── __init__.py\n │ │ │ │ ├── model.py\n │ │ │ │ ├── openaimodel.py\n │ │ │ │ └── util.py\n │ │ │ ├── distributions/\n │ │ │ │ ├── __init__.py\n │ │ │ │ └── distributions.py\n │ │ │ ├── encoders/\n │ │ │ │ ├── __init__.py\n │ │ │ │ └── modules.py\n │ │ │ ├── evaluate/\n │ │ │ │ ├── adm_evaluator.py\n │ │ │ │ ├── evaluate_perceptualsim.py\n │ │ │ │ ├── frechet_video_distance.py\n │ │ │ │ ├── ssim.py\n │ │ │ │ └── torch_frechet_video_distance.py\n │ │ │ ├── image_degradation/\n │ │ │ │ ├── utils/\n │ │ │ │ │ └── test.png\n │ │ │ │ ├── __init__.py\n │ │ │ │ ├── bsrgan.py\n │ │ │ │ ├── bsrgan_light.py\n │ │ │ │ └── utils_image.py\n │ │ │ ├── losses/\n │ │ │ │ ├── __init__.py\n │ │ │ │ ├── contperceptual.py\n │ │ │ │ └── vqperceptual.py\n │ │ │ ├── attention.py\n │ │ │ ├── ema.py\n │ │ │ └── x_transformer.py\n │ │ ├── thirdp/\n │ │ │ └── psp/\n │ │ │ ├── helpers.py\n │ │ │ ├── id_loss.py\n │ │ │ └── model_irse.py\n │ │ ├── __init__.py\n │ │ ├── extras.py\n │ │ ├── guidance.py\n │ │ ├── lr_scheduler.py\n │ │ └── util.py\n │ ├── __init__.py\n │ └── zero123.py\n ├── load/\n │ ├── images/\n │ │ ├── anya_front.png\n │ │ ├── anya_front_depth.png\n │ │ ├── anya_front_normal.png\n │ │ ├── anya_front_rgba.png\n │ │ ├── baby_phoenix_on_ice.png\n │ │ 
├── baby_phoenix_on_ice_depth.png\n │ │ ├── baby_phoenix_on_ice_normal.png\n │ │ ├── baby_phoenix_on_ice_rgba.png\n │ │ ├── beach_house_1.png\n │ │ ├── beach_house_1_depth.png\n │ │ ├── beach_house_1_normal.png\n │ │ ├── beach_house_1_rgba.png\n │ │ ├── beach_house_2.png\n │ │ ├── beach_house_2_depth.png\n │ │ ├── beach_house_2_normal.png\n │ │ ├── beach_house_2_rgba.png\n │ │ ├── bollywood_actress.png\n │ │ ├── bollywood_actress_depth.png\n │ │ ├── bollywood_actress_normal.png\n │ │ ├── bollywood_actress_rgba.png\n │ │ ├── cactus.png\n │ │ ├── cactus_depth.png\n │ │ ├── cactus_normal.png\n │ │ ├── cactus_rgba.png\n │ │ ├── catstatue.png\n │ │ ├── catstatue_depth.png\n │ │ ├── catstatue_normal.png\n │ │ ├── catstatue_rgba.png\n │ │ ├── church_ruins.png\n │ │ ├── church_ruins_depth.png\n │ │ ├── church_ruins_normal.png\n │ │ ├── church_ruins_rgba.png\n │ │ ├── dog1_rgba.png\n │ │ ├── dragon2_rgba.png\n │ │ ├── firekeeper.jpg\n │ │ ├── firekeeper_depth.png\n │ │ ├── firekeeper_normal.png\n │ │ ├── firekeeper_rgba.png\n │ │ ├── futuristic_car.png\n │ │ ├── futuristic_car_depth.png\n │ │ ├── futuristic_car_normal.png\n │ │ ├── futuristic_car_rgba.png\n │ │ ├── grootplant_rgba.png\n │ │ ├── hamburger.png\n │ │ ├── hamburger_depth.png\n │ │ ├── hamburger_rgba.png\n │ │ ├── mona_lisa.png\n │ │ ├── mona_lisa_depth.png\n │ │ ├── mona_lisa_normal.png\n │ │ ├── mona_lisa_rgba.png\n │ │ ├── robot_rgba.png\n │ │ ├── teddy.png\n │ │ ├── teddy_depth.png\n │ │ ├── teddy_normal.png\n │ │ ├── teddy_rgba.png\n │ │ └── thorhammer_rgba.png\n │ ├── lights/\n │ │ ├── bsdf_256_256.bin\n │ │ ├── LICENSE.txt\n │ │ └── mud_road_puresky_1k.hdr\n │ ├── shapes/\n │ │ ├── animal.obj\n │ │ ├── blub.obj\n │ │ ├── cabin.obj\n │ │ ├── env_sphere.obj\n │ │ ├── hand_prismatic.obj\n │ │ ├── human.obj\n │ │ ├── nascar.obj\n │ │ ├── potion.obj\n │ │ ├── README.md\n │ │ └── teddy.obj\n │ ├── tets/\n │ │ ├── 128_tets.npz\n │ │ ├── 32_tets.npz\n │ │ ├── 64_tets.npz\n │ │ └── generate_tets.py\n │ ├── zero123/\n │ │ ├── download.sh\n │ │ └── sd-objaverse-finetune-c_concat-256.yaml\n │ ├── make_prompt_library.py\n │ └── prompt_library.json\n ├── scripts/\n │ └── convert_zero123_to_diffusers.py\n ├── threestudio/\n │ ├── data/\n │ │ ├── __init__.py\n │ │ ├── co3d.py\n │ │ ├── image.py\n │ │ ├── multiview.py\n │ │ ├── uncond.py\n │ │ └── uncond_eff.py\n │ ├── models/\n │ │ ├── background/\n │ │ │ ├── __init__.py\n │ │ │ ├── base.py\n │ │ │ ├── neural_environment_map_background.py\n │ │ │ ├── solid_color_background.py\n │ │ │ └── textured_background.py\n │ │ ├── exporters/\n │ │ │ ├── __init__.py\n │ │ │ ├── base.py\n │ │ │ └── mesh_exporter.py\n │ │ ├── geometry/\n │ │ │ ├── __init__.py\n │ │ │ ├── base.py\n │ │ │ ├── custom_mesh.py\n │ │ │ ├── implicit_sdf.py\n │ │ │ ├── implicit_volume.py\n │ │ │ ├── tetrahedra_sdf_grid.py\n │ │ │ └── volume_grid.py\n │ │ ├── guidance/\n │ │ │ ├── __init__.py\n │ │ │ ├── controlnet_guidance.py\n │ │ │ ├── deep_floyd_guidance.py\n │ │ │ ├── instructpix2pix_guidance.py\n │ │ │ ├── stable_diffusion_guidance.py\n │ │ │ ├── stable_diffusion_sdi_guidance.py\n │ │ │ ├── stable_diffusion_unified_guidance.py\n │ │ │ ├── stable_diffusion_vsd_guidance.py\n │ │ │ ├── stable_zero123_guidance.py\n │ │ │ ├── zero123_guidance.py\n │ │ │ └── zero123_unified_guidance.py\n │ │ ├── materials/\n │ │ │ ├── __init__.py\n │ │ │ ├── base.py\n │ │ │ ├── diffuse_with_point_light_material.py\n │ │ │ ├── hybrid_rgb_latent_material.py\n │ │ │ ├── neural_radiance_material.py\n │ │ │ ├── no_material.py\n │ │ │ ├── pbr_material.py\n 
│ │ │ └── sd_latent_adapter_material.py\n │ │ ├── prompt_processors/\n │ │ │ ├── __init__.py\n │ │ │ ├── base.py\n │ │ │ ├── deepfloyd_prompt_processor.py\n │ │ │ ├── dummy_prompt_processor.py\n │ │ │ └── stable_diffusion_prompt_processor.py\n │ │ ├── renderers/\n │ │ │ ├── __init__.py\n │ │ │ ├── base.py\n │ │ │ ├── deferred_volume_renderer.py\n │ │ │ ├── gan_volume_renderer.py\n │ │ │ ├── nerf_volume_renderer.py\n │ │ │ ├── neus_volume_renderer.py\n │ │ │ ├── nvdiff_rasterizer.py\n │ │ │ └── patch_renderer.py\n │ │ ├── __init__.py\n │ │ ├── estimators.py\n │ │ ├── isosurface.py\n │ │ ├── mesh.py\n │ │ └── networks.py\n │ ├── scripts/\n │ │ ├── make_training_vid.py\n │ │ ├── run_zero123.sh\n │ │ ├── run_zero123_comparison.sh\n │ │ ├── run_zero123_phase.sh\n │ │ ├── run_zero123_phase2.sh\n │ │ ├── run_zero123_sbatch.py\n │ │ ├── zero123_demo.py\n │ │ └── zero123_sbatch.sh\n │ ├── systems/\n │ │ ├── __init__.py\n │ │ ├── base.py\n │ │ ├── control4d_multiview.py\n │ │ ├── dreamfusion.py\n │ │ ├── eff_dreamfusion.py\n │ │ ├── fantasia3d.py\n │ │ ├── imagedreamfusion.py\n │ │ ├── instructnerf2nerf.py\n │ │ ├── latentnerf.py\n │ │ ├── magic123.py\n │ │ ├── magic3d.py\n │ │ ├── optimizers.py\n │ │ ├── prolificdreamer.py\n │ │ ├── sdi.py\n │ │ ├── sjc.py\n │ │ ├── textmesh.py\n │ │ ├── utils.py\n │ │ ├── zero123.py\n │ │ └── zero123_simple.py\n │ ├── utils/\n │ │ ├── GAN/\n │ │ │ ├── __init__.py\n │ │ │ ├── attention.py\n │ │ │ ├── discriminator.py\n │ │ │ ├── distribution.py\n │ │ │ ├── loss.py\n │ │ │ ├── mobilenet.py\n │ │ │ ├── network_util.py\n │ │ │ ├── util.py\n │ │ │ └── vae.py\n │ │ ├── perceptual/\n │ │ │ ├── __init__.py\n │ │ │ ├── perceptual.py\n │ │ │ └── utils.py\n │ │ ├── __init__.py\n │ │ ├── base.py\n │ │ ├── callbacks.py\n │ │ ├── config.py\n │ │ ├── loss.py\n │ │ ├── misc.py\n │ │ ├── ops.py\n │ │ ├── rasterize.py\n │ │ ├── saving.py\n │ │ └── typing.py\n │ └── __init__.py\n ├── .editorconfig\n ├── .pre-commit-config.yaml\n ├── .pylintrc\n ├── 2dplayground.ipynb\n ├── 2dplayground_SDI_version.ipynb\n ├── CHANGELOG.md\n ├── DOCUMENTATION.md\n ├── gradio_app.py\n ├── launch.py\n ├── LICENSE\n ├── README.md\n ├── requirements-dev.txt\n ├── requirements.txt\n ├── setup.py\n └── threestudio.ipynb",
"stateUrl": "https://storage.mcpmark.ai/filesystem/threestudio.zip",
"stateOriginalUrl": "https://github.com/threestudio-project/threestudio"
}
}
================================================
FILE: tasks/filesystem/standard/threestudio/output_analysis/verify.py
================================================
#!/usr/bin/env python3
"""
Verification script for ThreeStudio Task 2: Analyze Zero123 Guidance Output Structure
"""
import sys
from pathlib import Path
import re
import os
def get_test_directory() -> Path:
"""Get the test directory from FILESYSTEM_TEST_DIR env var."""
test_root = os.environ.get("FILESYSTEM_TEST_DIR")
if not test_root:
raise ValueError("FILESYSTEM_TEST_DIR environment variable is required")
return Path(test_root)
def verify_answer_file_exists(test_dir: Path) -> bool:
"""Verify that the answer.txt file exists."""
answer_file = test_dir / "answer.txt"
if not answer_file.exists():
print("❌ File 'answer.txt' not found")
return False
print("✅ Answer file found")
return True
def verify_required_strings(test_dir: Path) -> bool:
"""Verify that the answer contains the four required strings."""
answer_file = test_dir / "answer.txt"
try:
content = answer_file.read_text()
# Check for required strings
required_strings = ["loss_sds", "grad_norm", "min_step", "max_step"]
missing_strings = []
for string in required_strings:
if string not in content:
missing_strings.append(string)
if missing_strings:
print(f"❌ Missing required strings: {missing_strings}")
return False
print("✅ All required strings found")
return True
except Exception as e:
print(f"❌ Error reading answer file: {e}")
return False
def verify_line_numbers(test_dir: Path) -> bool:
"""Verify that line numbers contain (323 or 324) AND (327 or 328)."""
answer_file = test_dir / "answer.txt"
try:
content = answer_file.read_text()
# Check for first number (323 or 324)
has_first = "323" in content or "324" in content
# Check for second number (327 or 328)
has_second = "327" in content or "328" in content
if not has_first:
print("❌ Missing first line number (323 or 324)")
return False
if not has_second:
print("❌ Missing second line number (327 or 328)")
return False
print("✅ Line numbers found: contains (323 or 324) and (327 or 328)")
return True
except Exception as e:
print(f"❌ Error verifying line numbers: {e}")
return False
def verify_file_path(test_dir: Path) -> bool:
"""Verify that the file path contains the exact expected path string."""
answer_file = test_dir / "answer.txt"
try:
content = answer_file.read_text()
# Check for the exact expected file path
expected_path = "threestudio/models/guidance/zero123_guidance.py"
if expected_path not in content:
print(f"❌ Missing expected file path: {expected_path}")
return False
print("✅ File path found: threestudio/models/guidance/zero123_guidance.py")
return True
except Exception as e:
print(f"❌ Error verifying file path: {e}")
return False
def main():
"""Main verification function."""
test_dir = get_test_directory()
print("🔍 Verifying ThreeStudio Task 2: Analyze Zero123 Guidance Output Structure...")
# Define verification steps
verification_steps = [
("Answer File Exists", verify_answer_file_exists),
("Required Strings", verify_required_strings),
("Line Numbers Range", verify_line_numbers),
("File Path Components", verify_file_path),
]
# Run all verification steps
all_passed = True
for step_name, verify_func in verification_steps:
print(f"\n--- {step_name} ---")
if not verify_func(test_dir):
all_passed = False
# Final result
print("\n" + "="*50)
if all_passed:
print("✅ Zero123 guidance output structure analyzed correctly!")
print("🎉 Task 2 verification: PASS")
sys.exit(0)
else:
print("❌ Task 2 verification: FAIL")
sys.exit(1)
if __name__ == "__main__":
main()
================================================
FILE: tasks/filesystem/standard/threestudio/requirements_completion/description.md
================================================
Please use FileSystem tools to finish the following task:
### Task Description
The `requirements.txt` file in the ThreeStudio project is used to install necessary Python libraries. However, the Zero123-related dependencies were accidentally deleted from the file. Your task is to restore these missing dependencies.
### Task Objectives
1. **Locate the requirements.txt file** in the test environment
2. **Identify the missing Zero123 dependencies** that need to be restored
3. **Add the required dependencies** to the requirements.txt file
4. **Ensure the file format is correct** (one dependency per line)
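A minimal sketch of restoring entries is shown below, assuming the missing packages are the ones the verifier for this task checks for (einops, kornia, a taming-transformers build, and CLIP from the openai GitHub repository); the exact spellings and pins in the upstream file may differ:

```python
#!/usr/bin/env python3
"""Minimal illustrative sketch (not part of the task): append candidate
Zero123 dependencies to requirements.txt, one per line, skipping entries
that are already present. Package spellings below are assumptions; the
authoritative list lives in the upstream ThreeStudio requirements.txt."""
import os
from pathlib import Path

test_dir = Path(os.environ["FILESYSTEM_TEST_DIR"])
req_file = test_dir / "requirements.txt"

candidates = [
    "einops",
    "kornia",
    "taming-transformers-rom1504",             # assumed taming build
    "git+https://github.com/openai/CLIP.git",  # "openai" and "clip" on one line
]

existing = req_file.read_text()
missing = [dep for dep in candidates if dep.lower() not in existing.lower()]
if missing:
    with req_file.open("a") as f:
        if existing and not existing.endswith("\n"):
            f.write("\n")
        f.write("\n".join(missing) + "\n")
```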
================================================
FILE: tasks/filesystem/standard/threestudio/requirements_completion/meta.json
================================================
{
"task_id": "requirements_completion",
"task_name": "Requirements Completion",
"category_id": "threestudio",
"category_name": "Threestudio",
"description": "Restore and complete missing Zero123-related dependencies in the requirements.txt file to ensure proper ThreeStudio project configuration.",
"author": "Lingjun Chen",
"created_at": "2025-08-05",
"difficulty": "L3",
"tags": [
"code exploration",
"cross-referencing"
],
"mcp": [
"filesystem"
],
"meta_data": {
"stateType": "text",
"stateContent": "threestudio/\n ├── configs/\n │ ├── debugging/\n │ │ ├── controlnet-canny.yaml\n │ │ ├── controlnet-normal.yaml\n │ │ ├── instructpix2pix.yaml\n │ │ └── stablediffusion.yaml\n │ ├── experimental/\n │ │ ├── unified-guidance/\n │ │ │ ├── dreamfusion-sd.yaml\n │ │ │ ├── hifa.yaml\n │ │ │ ├── prolificdreamer-hifa.yaml\n │ │ │ ├── prolificdreamer.yaml\n │ │ │ └── zero123-simple.yaml\n │ │ ├── co3d-imagecondition.yaml\n │ │ ├── imagecondition.yaml\n │ │ ├── imagecondition_zero123nerf.yaml\n │ │ ├── imagecondition_zero123nerf_refine.yaml\n │ │ ├── prolificdreamer-importance.yaml\n │ │ ├── prolificdreamer-neus-importance.yaml\n │ │ ├── prolificdreamer-propnet.yaml\n │ │ └── textmesh-if-importance.yaml\n │ ├── gradio/\n │ │ ├── dreamfusion-if.yaml\n │ │ ├── dreamfusion-sd.yaml\n │ │ ├── fantasia3d.yaml\n │ │ ├── latentnerf.yaml\n │ │ ├── sjc.yaml\n │ │ └── textmesh-if.yaml\n │ ├── control4d-static.yaml\n │ ├── dreamfusion-if.yaml\n │ ├── dreamfusion-sd-eff.yaml\n │ ├── dreamfusion-sd.yaml\n │ ├── fantasia3d-texture.yaml\n │ ├── fantasia3d.yaml\n │ ├── hifa.yaml\n │ ├── instructnerf2nerf.yaml\n │ ├── latentnerf-refine.yaml\n │ ├── latentnerf.yaml\n │ ├── magic123-coarse-sd.yaml\n │ ├── magic123-hifa-coarse-sd.yaml\n │ ├── magic123-hifa-refine-sd.yaml\n │ ├── magic123-refine-sd.yaml\n │ ├── magic3d-coarse-if.yaml\n │ ├── magic3d-coarse-sd.yaml\n │ ├── magic3d-refine-sd.yaml\n │ ├── prolificdreamer-geometry.yaml\n │ ├── prolificdreamer-hifa.yaml\n │ ├── prolificdreamer-patch.yaml\n │ ├── prolificdreamer-scene-hifa.yaml\n │ ├── prolificdreamer-scene.yaml\n │ ├── prolificdreamer-texture.yaml\n │ ├── prolificdreamer.yaml\n │ ├── sdi.yaml\n │ ├── sjc.yaml\n │ ├── sketchshape-refine.yaml\n │ ├── sketchshape.yaml\n │ ├── stable-zero123.yaml\n │ ├── textmesh-if.yaml\n │ ├── zero123-geometry.yaml\n │ └── zero123.yaml\n ├── custom/\n │ └── put_custom_extensions_here\n ├── docker/\n │ ├── compose.yaml\n │ └── Dockerfile\n ├── docs/\n │ └── installation.md\n ├── extern/\n │ ├── ldm_zero123/\n │ │ ├── models/\n │ │ │ ├── diffusion/\n │ │ │ │ ├── __init__.py\n │ │ │ │ ├── classifier.py\n │ │ │ │ ├── ddim.py\n │ │ │ │ ├── ddpm.py\n │ │ │ │ ├── plms.py\n │ │ │ │ └── sampling_util.py\n │ │ │ └── autoencoder.py\n │ │ ├── modules/\n │ │ │ ├── diffusionmodules/\n │ │ │ │ ├── __init__.py\n │ │ │ │ ├── model.py\n │ │ │ │ ├── openaimodel.py\n │ │ │ │ └── util.py\n │ │ │ ├── distributions/\n │ │ │ │ ├── __init__.py\n │ │ │ │ └── distributions.py\n │ │ │ ├── encoders/\n │ │ │ │ ├── __init__.py\n │ │ │ │ └── modules.py\n │ │ │ ├── evaluate/\n │ │ │ │ ├── adm_evaluator.py\n │ │ │ │ ├── evaluate_perceptualsim.py\n │ │ │ │ ├── frechet_video_distance.py\n │ │ │ │ ├── ssim.py\n │ │ │ │ └── torch_frechet_video_distance.py\n │ │ │ ├── image_degradation/\n │ │ │ │ ├── utils/\n │ │ │ │ │ └── test.png\n │ │ │ │ ├── __init__.py\n │ │ │ │ ├── bsrgan.py\n │ │ │ │ ├── bsrgan_light.py\n │ │ │ │ └── utils_image.py\n │ │ │ ├── losses/\n │ │ │ │ ├── __init__.py\n │ │ │ │ ├── contperceptual.py\n │ │ │ │ └── vqperceptual.py\n │ │ │ ├── attention.py\n │ │ │ ├── ema.py\n │ │ │ └── x_transformer.py\n │ │ ├── thirdp/\n │ │ │ └── psp/\n │ │ │ ├── helpers.py\n │ │ │ ├── id_loss.py\n │ │ │ └── model_irse.py\n │ │ ├── __init__.py\n │ │ ├── extras.py\n │ │ ├── guidance.py\n │ │ ├── lr_scheduler.py\n │ │ └── util.py\n │ ├── __init__.py\n │ └── zero123.py\n ├── load/\n │ ├── images/\n │ │ ├── anya_front.png\n │ │ ├── anya_front_depth.png\n │ │ ├── anya_front_normal.png\n │ │ ├── anya_front_rgba.png\n │ │ ├── baby_phoenix_on_ice.png\n │ │ 
├── baby_phoenix_on_ice_depth.png\n │ │ ├── baby_phoenix_on_ice_normal.png\n │ │ ├── baby_phoenix_on_ice_rgba.png\n │ │ ├── beach_house_1.png\n │ │ ├── beach_house_1_depth.png\n │ │ ├── beach_house_1_normal.png\n │ │ ├── beach_house_1_rgba.png\n │ │ ├── beach_house_2.png\n │ │ ├── beach_house_2_depth.png\n │ │ ├── beach_house_2_normal.png\n │ │ ├── beach_house_2_rgba.png\n │ │ ├── bollywood_actress.png\n │ │ ├── bollywood_actress_depth.png\n │ │ ├── bollywood_actress_normal.png\n │ │ ├── bollywood_actress_rgba.png\n │ │ ├── cactus.png\n │ │ ├── cactus_depth.png\n │ │ ├── cactus_normal.png\n │ │ ├── cactus_rgba.png\n │ │ ├── catstatue.png\n │ │ ├── catstatue_depth.png\n │ │ ├── catstatue_normal.png\n │ │ ├── catstatue_rgba.png\n │ │ ├── church_ruins.png\n │ │ ├── church_ruins_depth.png\n │ │ ├── church_ruins_normal.png\n │ │ ├── church_ruins_rgba.png\n │ │ ├── dog1_rgba.png\n │ │ ├── dragon2_rgba.png\n │ │ ├── firekeeper.jpg\n │ │ ├── firekeeper_depth.png\n │ │ ├── firekeeper_normal.png\n │ │ ├── firekeeper_rgba.png\n │ │ ├── futuristic_car.png\n │ │ ├── futuristic_car_depth.png\n │ │ ├── futuristic_car_normal.png\n │ │ ├── futuristic_car_rgba.png\n │ │ ├── grootplant_rgba.png\n │ │ ├── hamburger.png\n │ │ ├── hamburger_depth.png\n │ │ ├── hamburger_rgba.png\n │ │ ├── mona_lisa.png\n │ │ ├── mona_lisa_depth.png\n │ │ ├── mona_lisa_normal.png\n │ │ ├── mona_lisa_rgba.png\n │ │ ├── robot_rgba.png\n │ │ ├── teddy.png\n │ │ ├── teddy_depth.png\n │ │ ├── teddy_normal.png\n │ │ ├── teddy_rgba.png\n │ │ └── thorhammer_rgba.png\n │ ├── lights/\n │ │ ├── bsdf_256_256.bin\n │ │ ├── LICENSE.txt\n │ │ └── mud_road_puresky_1k.hdr\n │ ├── shapes/\n │ │ ├── animal.obj\n │ │ ├── blub.obj\n │ │ ├── cabin.obj\n │ │ ├── env_sphere.obj\n │ │ ├── hand_prismatic.obj\n │ │ ├── human.obj\n │ │ ├── nascar.obj\n │ │ ├── potion.obj\n │ │ ├── README.md\n │ │ └── teddy.obj\n │ ├── tets/\n │ │ ├── 128_tets.npz\n │ │ ├── 32_tets.npz\n │ │ ├── 64_tets.npz\n │ │ └── generate_tets.py\n │ ├── zero123/\n │ │ ├── download.sh\n │ │ └── sd-objaverse-finetune-c_concat-256.yaml\n │ ├── make_prompt_library.py\n │ └── prompt_library.json\n ├── scripts/\n │ └── convert_zero123_to_diffusers.py\n ├── threestudio/\n │ ├── data/\n │ │ ├── __init__.py\n │ │ ├── co3d.py\n │ │ ├── image.py\n │ │ ├── multiview.py\n │ │ ├── uncond.py\n │ │ └── uncond_eff.py\n │ ├── models/\n │ │ ├── background/\n │ │ │ ├── __init__.py\n │ │ │ ├── base.py\n │ │ │ ├── neural_environment_map_background.py\n │ │ │ ├── solid_color_background.py\n │ │ │ └── textured_background.py\n │ │ ├── exporters/\n │ │ │ ├── __init__.py\n │ │ │ ├── base.py\n │ │ │ └── mesh_exporter.py\n │ │ ├── geometry/\n │ │ │ ├── __init__.py\n │ │ │ ├── base.py\n │ │ │ ├── custom_mesh.py\n │ │ │ ├── implicit_sdf.py\n │ │ │ ├── implicit_volume.py\n │ │ │ ├── tetrahedra_sdf_grid.py\n │ │ │ └── volume_grid.py\n │ │ ├── guidance/\n │ │ │ ├── __init__.py\n │ │ │ ├── controlnet_guidance.py\n │ │ │ ├── deep_floyd_guidance.py\n │ │ │ ├── instructpix2pix_guidance.py\n │ │ │ ├── stable_diffusion_guidance.py\n │ │ │ ├── stable_diffusion_sdi_guidance.py\n │ │ │ ├── stable_diffusion_unified_guidance.py\n │ │ │ ├── stable_diffusion_vsd_guidance.py\n │ │ │ ├── stable_zero123_guidance.py\n │ │ │ ├── zero123_guidance.py\n │ │ │ └── zero123_unified_guidance.py\n │ │ ├── materials/\n │ │ │ ├── __init__.py\n │ │ │ ├── base.py\n │ │ │ ├── diffuse_with_point_light_material.py\n │ │ │ ├── hybrid_rgb_latent_material.py\n │ │ │ ├── neural_radiance_material.py\n │ │ │ ├── no_material.py\n │ │ │ ├── pbr_material.py\n 
│ │ │ └── sd_latent_adapter_material.py\n │ │ ├── prompt_processors/\n │ │ │ ├── __init__.py\n │ │ │ ├── base.py\n │ │ │ ├── deepfloyd_prompt_processor.py\n │ │ │ ├── dummy_prompt_processor.py\n │ │ │ └── stable_diffusion_prompt_processor.py\n │ │ ├── renderers/\n │ │ │ ├── __init__.py\n │ │ │ ├── base.py\n │ │ │ ├── deferred_volume_renderer.py\n │ │ │ ├── gan_volume_renderer.py\n │ │ │ ├── nerf_volume_renderer.py\n │ │ │ ├── neus_volume_renderer.py\n │ │ │ ├── nvdiff_rasterizer.py\n │ │ │ └── patch_renderer.py\n │ │ ├── __init__.py\n │ │ ├── estimators.py\n │ │ ├── isosurface.py\n │ │ ├── mesh.py\n │ │ └── networks.py\n │ ├── scripts/\n │ │ ├── make_training_vid.py\n │ │ ├── run_zero123.sh\n │ │ ├── run_zero123_comparison.sh\n │ │ ├── run_zero123_phase.sh\n │ │ ├── run_zero123_phase2.sh\n │ │ ├── run_zero123_sbatch.py\n │ │ ├── zero123_demo.py\n │ │ └── zero123_sbatch.sh\n │ ├── systems/\n │ │ ├── __init__.py\n │ │ ├── base.py\n │ │ ├── control4d_multiview.py\n │ │ ├── dreamfusion.py\n │ │ ├── eff_dreamfusion.py\n │ │ ├── fantasia3d.py\n │ │ ├── imagedreamfusion.py\n │ │ ├── instructnerf2nerf.py\n │ │ ├── latentnerf.py\n │ │ ├── magic123.py\n │ │ ├── magic3d.py\n │ │ ├── optimizers.py\n │ │ ├── prolificdreamer.py\n │ │ ├── sdi.py\n │ │ ├── sjc.py\n │ │ ├── textmesh.py\n │ │ ├── utils.py\n │ │ ├── zero123.py\n │ │ └── zero123_simple.py\n │ ├── utils/\n │ │ ├── GAN/\n │ │ │ ├── __init__.py\n │ │ │ ├── attention.py\n │ │ │ ├── discriminator.py\n │ │ │ ├── distribution.py\n │ │ │ ├── loss.py\n │ │ │ ├── mobilenet.py\n │ │ │ ├── network_util.py\n │ │ │ ├── util.py\n │ │ │ └── vae.py\n │ │ ├── perceptual/\n │ │ │ ├── __init__.py\n │ │ │ ├── perceptual.py\n │ │ │ └── utils.py\n │ │ ├── __init__.py\n │ │ ├── base.py\n │ │ ├── callbacks.py\n │ │ ├── config.py\n │ │ ├── loss.py\n │ │ ├── misc.py\n │ │ ├── ops.py\n │ │ ├── rasterize.py\n │ │ ├── saving.py\n │ │ └── typing.py\n │ └── __init__.py\n ├── .editorconfig\n ├── .pre-commit-config.yaml\n ├── .pylintrc\n ├── 2dplayground.ipynb\n ├── 2dplayground_SDI_version.ipynb\n ├── CHANGELOG.md\n ├── DOCUMENTATION.md\n ├── gradio_app.py\n ├── launch.py\n ├── LICENSE\n ├── README.md\n ├── requirements-dev.txt\n ├── requirements.txt\n ├── setup.py\n └── threestudio.ipynb",
"stateUrl": "https://storage.mcpmark.ai/filesystem/threestudio.zip",
"stateOriginalUrl": "https://github.com/threestudio-project/threestudio"
}
}
================================================
FILE: tasks/filesystem/standard/threestudio/requirements_completion/verify.py
================================================
#!/usr/bin/env python3
"""
Verification script for ThreeStudio Task 3: Restore Zero123 Dependencies in Requirements.txt
"""
import sys
from pathlib import Path
import re
import os
def get_test_directory() -> Path:
"""Get the test directory from FILESYSTEM_TEST_DIR env var."""
test_root = os.environ.get("FILESYSTEM_TEST_DIR")
if not test_root:
raise ValueError("FILESYSTEM_TEST_DIR environment variable is required")
return Path(test_root)
def verify_requirements_file_exists(test_dir: Path) -> bool:
"""Verify that the requirements.txt file exists."""
requirements_file = test_dir / "requirements.txt"
if not requirements_file.exists():
print("❌ File 'requirements.txt' not found")
return False
print("✅ Requirements.txt file found")
return True
def verify_requirements_file_readable(test_dir: Path) -> bool:
"""Verify that the requirements.txt file is readable."""
requirements_file = test_dir / "requirements.txt"
try:
content = requirements_file.read_text()
if not content.strip():
print("❌ Requirements.txt file is empty")
return False
print("✅ Requirements.txt file is readable")
return True
except Exception as e:
print(f"❌ Error reading requirements.txt file: {e}")
return False
def verify_required_dependencies_present(test_dir: Path) -> bool:
"""Verify that all required Zero123 dependencies are present."""
requirements_file = test_dir / "requirements.txt"
try:
content = requirements_file.read_text()
# Required dependencies to check for (simplified)
required_deps = [
"einops",
"kornia",
"taming",
"openai",
"clip"
]
missing_deps = []
found_deps = []
for dep in required_deps:
if dep.lower() in content.lower():
found_deps.append(dep)
else:
missing_deps.append(dep)
if missing_deps:
print(f"❌ Missing required dependencies: {missing_deps}")
return False
print(f"✅ All required dependencies found: {found_deps}")
return True
except Exception as e:
print(f"❌ Error checking dependencies: {e}")
return False
def verify_specific_dependency_entries(test_dir: Path) -> bool:
"""Verify that the specific dependency entries are present."""
requirements_file = test_dir / "requirements.txt"
try:
content = requirements_file.read_text()
# Check for specific dependency entries (simplified)
# For taming, we only need to check if "taming" is present, not the full package name
required_checks = [
("einops", "einops"),
("kornia", "kornia"),
("taming", "taming"), # Just check for "taming" substring
]
missing_entries = []
found_entries = []
for check_name, full_entry in required_checks:
if check_name in content.lower():
found_entries.append(check_name)
else:
missing_entries.append(check_name)
# Special check for openai and clip - they should be on the same line
lines = content.split('\n')
openai_clip_found = False
for line in lines:
line_lower = line.lower()
if "openai" in line_lower and "clip" in line_lower:
openai_clip_found = True
break
if openai_clip_found:
found_entries.append("openai+clip")
else:
missing_entries.append("openai+clip")
if missing_entries:
print(f"❌ Missing required dependency checks: {missing_entries}")
return False
print(f"✅ All required dependency checks passed: {found_entries}")
return True
except Exception as e:
print(f"❌ Error checking specific entries: {e}")
return False
def verify_file_format(test_dir: Path) -> bool:
"""Verify that the requirements.txt file has proper format."""
requirements_file = test_dir / "requirements.txt"
try:
content = requirements_file.read_text()
lines = content.split('\n')
# Basic format check - just ensure file is not completely empty
if not content.strip():
print("❌ File is completely empty")
return False
print("✅ File format is acceptable")
return True
except Exception as e:
print(f"❌ Error checking file format: {e}")
return False
def verify_no_duplicate_entries(test_dir: Path) -> bool:
"""Verify that there are no duplicate dependency entries."""
requirements_file = test_dir / "requirements.txt"
try:
content = requirements_file.read_text()
# Simplified duplicate check - just ensure the file is not completely corrupted
if len(content) < 10: # Basic sanity check
print("❌ File seems too short to be valid")
return False
print("✅ File appears to be valid")
return True
except Exception as e:
print(f"❌ Error checking file: {e}")
return False
def main():
"""Main verification function."""
test_dir = get_test_directory()
print("🔍 Verifying ThreeStudio Task 3: Restore Zero123 Dependencies in Requirements.txt...")
# Define verification steps
verification_steps = [
("Requirements File Exists", verify_requirements_file_exists),
("File is Readable", verify_requirements_file_readable),
("Required Dependencies Present", verify_required_dependencies_present),
("Specific Entries Present", verify_specific_dependency_entries),
("File Format", verify_file_format),
("File Validity", verify_no_duplicate_entries),
]
# Run all verification steps
all_passed = True
for step_name, verify_func in verification_steps:
print(f"\n--- {step_name} ---")
if not verify_func(test_dir):
all_passed = False
# Final result
print("\n" + "="*50)
if all_passed:
print("✅ Zero123 dependencies successfully restored in requirements.txt!")
print("🎉 Task 3 verification: PASS")
sys.exit(0)
else:
print("❌ Task 3 verification: FAIL")
sys.exit(1)
if __name__ == "__main__":
main()
================================================
FILE: tasks/filesystem/standard/votenet/dataset_comparison/description.md
================================================
Please use FileSystem tools to finish the following task:
### Task Description
Analyze the codebase to map ScanNet object categories to SUN RGB-D categories and calculate object counts.
### Task Objectives
1. **Primary Goal**: Use SUN RGB-D's 10-category classification system as the target taxonomy
2. **Mapping Requirement**: Map each ScanNet object category (using the "category" field, not "raw_category") to the corresponding SUN RGB-D category
3. **Calculation**: For each SUN RGB-D category, calculate the total count of ScanNet objects that map to that category (an object counts only if the category (not raw_category) names are exactly the same: night_stand = nightstand)
4. **Output**: Generate an analysis.txt file in the main directory showing the mapping and counts
### Expected Output
Create a file named `analysis.txt` in the test directory root with the following format:
- Each SUN RGB-D category should be represented as a 2-line block
- Line 1: category name
- Line 2: total count
- Each block should be separated by one empty line
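A minimal sketch of the required output format follows; the categories and counts below are placeholders, not the answer, and would typically come from mapping the "category" column of `scannetv2-labels.combined.tsv` onto the SUN RGB-D class list in `sunrgbd/model_util_sunrgbd.py`:

```python
#!/usr/bin/env python3
"""Minimal illustrative sketch (not part of the task): write analysis.txt
as 2-line blocks (category name, then count) separated by one empty line.
All categories and counts shown here are placeholders."""
import os
from pathlib import Path

test_dir = Path(os.environ["FILESYSTEM_TEST_DIR"])

# Placeholder mapping: SUN RGB-D category -> total mapped ScanNet object count.
counts = {
    "bed": 0,    # placeholder count
    "table": 0,  # placeholder count
    "sofa": 0,   # placeholder count
    # ... the remaining SUN RGB-D categories would follow here
}

blocks = [f"{name}\n{count}" for name, count in counts.items()]
(test_dir / "analysis.txt").write_text("\n\n".join(blocks) + "\n")
```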
================================================
FILE: tasks/filesystem/standard/votenet/dataset_comparison/meta.json
================================================
{
"task_id": "dataset_comparison",
"task_name": "Dataset Comparison",
"category_id": "votenet",
"category_name": "Votenet",
"description": "Map ScanNet object categories to their SUN RGB-D equivalents and calculate detailed object counts for each mapped category.",
"author": "Lingjun Chen",
"created_at": "2025-08-13",
"difficulty": "L3",
"tags": [
"cross-referencing",
"data extraction",
"pattern analysis"
],
"mcp": [
"filesystem"
],
"meta_data": {
"stateType": "text",
"stateContent": "votenet/\n ├── doc/\n │ ├── teaser.jpg\n │ └── tips.md\n ├── models/\n │ ├── ap_helper.py\n │ ├── backbone_module.py\n │ ├── boxnet.py\n │ ├── dump_helper.py\n │ ├── loss_helper.py\n │ ├── loss_helper_boxnet.py\n │ ├── proposal_module.py\n │ ├── votenet.py\n │ └── voting_module.py\n ├── pointnet2/\n │ ├── _ext_src/\n │ │ ├── include/\n │ │ │ ├── ball_query.h\n │ │ │ ├── cuda_utils.h\n │ │ │ ├── group_points.h\n │ │ │ ├── interpolate.h\n │ │ │ ├── sampling.h\n │ │ │ └── utils.h\n │ │ └── src/\n │ │ ├── ball_query.cpp\n │ │ ├── ball_query_gpu.cu\n │ │ ├── bindings.cpp\n │ │ ├── group_points.cpp\n │ │ ├── group_points_gpu.cu\n │ │ ├── interpolate.cpp\n │ │ ├── interpolate_gpu.cu\n │ │ ├── sampling.cpp\n │ │ └── sampling_gpu.cu\n │ ├── pointnet2_modules.py\n │ ├── pointnet2_test.py\n │ ├── pointnet2_utils.py\n │ ├── pytorch_utils.py\n │ └── setup.py\n ├── scannet/\n │ ├── meta_data/\n │ │ ├── scannet_means.npz\n │ │ ├── scannet_train.txt\n │ │ ├── scannetv2-labels.combined.tsv\n │ │ ├── scannetv2_test.txt\n │ │ ├── scannetv2_train.txt\n │ │ └── scannetv2_val.txt\n │ ├── scans/\n │ ├── batch_load_scannet_data.py\n │ ├── data_viz.py\n │ ├── load_scannet_data.py\n │ ├── model_util_scannet.py\n │ ├── README.md\n │ ├── scannet_detection_dataset.py\n │ └── scannet_utils.py\n ├── sunrgbd/\n │ ├── matlab/\n │ │ ├── extract_rgbd_data_v1.m\n │ │ ├── extract_rgbd_data_v2.m\n │ │ └── extract_split.m\n │ ├── OFFICIAL_SUNRGBD/\n │ ├── sunrgbd_trainval/\n │ ├── model_util_sunrgbd.py\n │ ├── README.md\n │ ├── sunrgbd_data.py\n │ ├── sunrgbd_detection_dataset.py\n │ └── sunrgbd_utils.py\n ├── utils/\n │ ├── box_util.py\n │ ├── eval_det.py\n │ ├── metric_util.py\n │ ├── nms.py\n │ ├── nn_distance.py\n │ ├── pc_util.py\n │ ├── tf_logger.py\n │ └── tf_visualizer.py\n ├── CODE_OF_CONDUCT.md\n ├── CONTRIBUTING.md\n ├── demo.py\n ├── eval.py\n ├── LICENSE\n ├── README.md\n └── train.py",
"stateUrl": "https://storage.mcpmark.ai/filesystem/votenet.zip",
"stateOriginalUrl": "https://github.com/facebookresearch/votenet"
}
}
================================================
FILE: tasks/filesystem/standard/votenet/dataset_comparison/verify.py
================================================
#!/usr/bin/env python3
"""
Verification script for Votenet Dataset Comparison Task
"""
import sys
from pathlib import Path
import re
import os
def get_test_directory() -> Path:
"""Get the test directory from FILESYSTEM_TEST_DIR env var."""
test_root = os.environ.get("FILESYSTEM_TEST_DIR")
if not test_root:
raise ValueError("FILESYSTEM_TEST_DIR environment variable is required")
return Path(test_root)
def verify_analysis_file_exists(test_dir: Path) -> bool:
"""Verify that the analysis.txt file exists."""
analysis_file = test_dir / "analysis.txt"
if not analysis_file.exists():
print("❌ File 'analysis.txt' not found")
return False
print("✅ Analysis file found")
return True
def verify_analysis_format(test_dir: Path) -> bool:
"""Verify that the analysis file has the correct format."""
analysis_file = test_dir / "analysis.txt"
try:
content = analysis_file.read_text()
lines = content.split('\n')
# Check if content is not empty
if not content.strip():
print("❌ Analysis file is empty")
return False
# Check if we have enough lines for at least one category block
if len(lines) < 2:
print("❌ Analysis file doesn't have enough lines for a category block")
return False
# Check if the format follows the 2-line block pattern with empty lines between blocks
# Each block should have: category_name, count, empty_line
line_index = 0
block_count = 0
while line_index < len(lines):
# Skip leading empty lines
while line_index < len(lines) and lines[line_index].strip() == "":
line_index += 1
if line_index >= len(lines):
break
# Check if we have at least 2 lines for a block
if line_index + 1 >= len(lines):
print("❌ Incomplete category block at the end")
return False
# Line 1 should be category name
category_line = lines[line_index].strip()
if not category_line:
print(f"❌ Empty category name at line {line_index + 1}")
return False
# Line 2 should be count
count_line = lines[line_index + 1].strip()
if not count_line:
print(f"❌ Empty count at line {line_index + 2}")
return False
# Check if count line contains a number
if not re.search(r'\d+', count_line):
print(f"❌ Count line doesn't contain a number at line {line_index + 2}: '{count_line}'")
return False
block_count += 1
line_index += 2
# Skip empty line between blocks (if not at the end)
if line_index < len(lines) and lines[line_index].strip() == "":
line_index += 1
if block_count == 0:
print("❌ No valid category blocks found")
return False
print(f"✅ Analysis format is correct with {block_count} category blocks")
return True
except Exception as e:
print(f"❌ Error reading analysis file: {e}")
return False
def verify_required_categories(test_dir: Path) -> bool:
"""Verify that all required SUN RGB-D categories are present."""
analysis_file = test_dir / "analysis.txt"
try:
content = analysis_file.read_text()
lines = content.split('\n')
# Extract category names from the file
categories_found = []
line_index = 0
while line_index < len(lines):
# Skip empty lines
while line_index < len(lines) and lines[line_index].strip() == "":
line_index += 1
if line_index >= len(lines):
break
# Get category name
category_line = lines[line_index].strip()
if category_line:
categories_found.append(category_line.lower())
# Skip to next block
line_index += 2
while line_index < len(lines) and lines[line_index].strip() == "":
line_index += 1
# Required categories
required_categories = {
'chair', 'table', 'bed', 'bookshelf', 'desk',
'toilet', 'dresser', 'bathtub', 'sofa', 'night_stand'
}
# Check if all required categories are present
missing_categories = required_categories - set(categories_found)
if missing_categories:
print(f"❌ Missing required categories: {missing_categories}")
return False
# Check for extra categories
extra_categories = set(categories_found) - required_categories
if extra_categories:
print(f"⚠️ Extra categories found: {extra_categories}")
print(f"✅ All required categories present: {sorted(required_categories)}")
return True
except Exception as e:
print(f"❌ Error verifying required categories: {e}")
return False
def verify_category_counts(test_dir: Path) -> bool:
"""Verify that the category counts match the expected values."""
analysis_file = test_dir / "analysis.txt"
try:
content = analysis_file.read_text()
lines = content.split('\n')
# Expected counts from answer.txt
expected_counts = {
'chair': 4681,
'table': 1170,
'bed': 370,
'bookshelf': 377,
'desk': 680,
'toilet': 256,
'dresser': 213,
'bathtub': 144,
'sofa': 1,
'night_stand': 224
}
# Extract category counts from the file
category_counts = {}
line_index = 0
while line_index < len(lines):
# Skip empty lines
while line_index < len(lines) and lines[line_index].strip() == "":
line_index += 1
if line_index >= len(lines):
break
# Get category name
category_line = lines[line_index].strip()
if not category_line:
line_index += 1
continue
# Get count
if line_index + 1 < len(lines):
count_line = lines[line_index + 1].strip()
if count_line:
# Extract number from count line
count_match = re.search(r'(\d+)', count_line)
if count_match:
category = category_line.lower()
count = int(count_match.group(1))
category_counts[category] = count
# Skip to next block
line_index += 2
while line_index < len(lines) and lines[line_index].strip() == "":
line_index += 1
# Verify counts match expected values
all_counts_correct = True
for category, expected_count in expected_counts.items():
if category in category_counts:
actual_count = category_counts[category]
if actual_count != expected_count:
print(f"❌ Count mismatch for {category}: expected {expected_count}, got {actual_count}")
all_counts_correct = False
else:
print(f"❌ Category {category} not found in analysis")
all_counts_correct = False
if all_counts_correct:
print("✅ All category counts match expected values")
return True
else:
return False
except Exception as e:
print(f"❌ Error verifying category counts: {e}")
return False
def verify_file_structure(test_dir: Path) -> bool:
"""Verify that the analysis.txt file is in the correct location."""
analysis_file = test_dir / "analysis.txt"
if not analysis_file.exists():
print("❌ Analysis file not found in test directory root")
return False
# Check if it's directly in the test directory root, not in a subdirectory
if analysis_file.parent != test_dir:
print("❌ Analysis file should be in the test directory root")
return False
print("✅ Analysis file is in the correct location")
return True
def main():
"""Main verification function."""
test_dir = get_test_directory()
print("🔍 Verifying Votenet Dataset Comparison Task...")
# Define verification steps
verification_steps = [
("Analysis File Exists", verify_analysis_file_exists),
("File Location", verify_file_structure),
("File Format", verify_analysis_format),
("Required Categories", verify_required_categories),
("Category Counts", verify_category_counts),
]
# Run all verification steps
all_passed = True
for step_name, verify_func in verification_steps:
print(f"\n--- {step_name} ---")
if not verify_func(test_dir):
all_passed = False
# Final result
print("\n" + "="*50)
if all_passed:
print("✅ Votenet dataset comparison task completed correctly!")
print("🎉 Task verification: PASS")
sys.exit(0)
else:
print("❌ Task verification: FAIL")
sys.exit(1)
if __name__ == "__main__":
main()
================================================
FILE: tasks/filesystem/standard/votenet/debugging/description.md
================================================
Please use FileSystem tools to finish the following task:
### Task Description
There is a bug in the VoteNet backbone module that needs to be identified and fixed.
### Task Objectives
1. **Examine the codebase** using filesystem MCP tools
2. **Identify the bug** anywhere in the whole process
3. **Fix the bug** in the code
4. **Create an answer file** with the bug location
### Expected Output
1. **Fix the bug** in the code file directly
2. **Create `answer.txt`** in the test directory root with the format: `path`
**Requirements:**
- Only include the bug's file path in answer.txt
- No additional text or explanation
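For illustration only, if the bug were in a hypothetical file `utils/pc_util.py`, `answer.txt` would contain exactly one line:
```
utils/pc_util.py
```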
### Hint
**The bug is not in demo.py**; look deeper inside the codebase.
================================================
FILE: tasks/filesystem/standard/votenet/debugging/meta.json
================================================
{
"task_id": "debugging",
"task_name": "Debugging",
"category_id": "votenet",
"category_name": "Votenet",
"description": "Identify and fix bugs in the VoteNet backbone module by examining the codebase and implementing necessary corrections.",
"author": "Lingjun Chen",
"created_at": "2025-08-13",
"difficulty": "L3",
"tags": [
"code exploration"
],
"mcp": [
"filesystem"
],
"meta_data": {
"stateType": "text",
"stateContent": "votenet/\n ├── doc/\n │ ├── teaser.jpg\n │ └── tips.md\n ├── models/\n │ ├── ap_helper.py\n │ ├── backbone_module.py\n │ ├── boxnet.py\n │ ├── dump_helper.py\n │ ├── loss_helper.py\n │ ├── loss_helper_boxnet.py\n │ ├── proposal_module.py\n │ ├── votenet.py\n │ └── voting_module.py\n ├── pointnet2/\n │ ├── _ext_src/\n │ │ ├── include/\n │ │ │ ├── ball_query.h\n │ │ │ ├── cuda_utils.h\n │ │ │ ├── group_points.h\n │ │ │ ├── interpolate.h\n │ │ │ ├── sampling.h\n │ │ │ └── utils.h\n │ │ └── src/\n │ │ ├── ball_query.cpp\n │ │ ├── ball_query_gpu.cu\n │ │ ├── bindings.cpp\n │ │ ├── group_points.cpp\n │ │ ├── group_points_gpu.cu\n │ │ ├── interpolate.cpp\n │ │ ├── interpolate_gpu.cu\n │ │ ├── sampling.cpp\n │ │ └── sampling_gpu.cu\n │ ├── pointnet2_modules.py\n │ ├── pointnet2_test.py\n │ ├── pointnet2_utils.py\n │ ├── pytorch_utils.py\n │ └── setup.py\n ├── scannet/\n │ ├── meta_data/\n │ │ ├── scannet_means.npz\n │ │ ├── scannet_train.txt\n │ │ ├── scannetv2-labels.combined.tsv\n │ │ ├── scannetv2_test.txt\n │ │ ├── scannetv2_train.txt\n │ │ └── scannetv2_val.txt\n │ ├── scans/\n │ ├── batch_load_scannet_data.py\n │ ├── data_viz.py\n │ ├── load_scannet_data.py\n │ ├── model_util_scannet.py\n │ ├── README.md\n │ ├── scannet_detection_dataset.py\n │ └── scannet_utils.py\n ├── sunrgbd/\n │ ├── matlab/\n │ │ ├── extract_rgbd_data_v1.m\n │ │ ├── extract_rgbd_data_v2.m\n │ │ └── extract_split.m\n │ ├── OFFICIAL_SUNRGBD/\n │ ├── sunrgbd_trainval/\n │ ├── model_util_sunrgbd.py\n │ ├── README.md\n │ ├── sunrgbd_data.py\n │ ├── sunrgbd_detection_dataset.py\n │ └── sunrgbd_utils.py\n ├── utils/\n │ ├── box_util.py\n │ ├── eval_det.py\n │ ├── metric_util.py\n │ ├── nms.py\n │ ├── nn_distance.py\n │ ├── pc_util.py\n │ ├── tf_logger.py\n │ └── tf_visualizer.py\n ├── CODE_OF_CONDUCT.md\n ├── CONTRIBUTING.md\n ├── demo.py\n ├── eval.py\n ├── LICENSE\n ├── README.md\n └── train.py",
"stateUrl": "https://storage.mcpmark.ai/filesystem/votenet.zip",
"stateOriginalUrl": "https://github.com/facebookresearch/votenet"
}
}
================================================
FILE: tasks/filesystem/standard/votenet/debugging/verify.py
================================================
#!/usr/bin/env python3
"""
Verification script for VoteNet Task: Debug Backbone Module
"""
import sys
from pathlib import Path
import re
import os
def get_test_directory() -> Path:
"""Get the test directory from FILESYSTEM_TEST_DIR env var."""
test_root = os.environ.get("FILESYSTEM_TEST_DIR")
if not test_root:
raise ValueError("FILESYSTEM_TEST_DIR environment variable is required")
return Path(test_root)
def verify_answer_file_exists(test_dir: Path) -> bool:
"""Verify that the answer.txt file exists."""
answer_file = test_dir / "answer.txt"
if not answer_file.exists():
print("❌ File 'answer.txt' not found")
return False
print("✅ Answer file found")
return True
def verify_answer_format(test_dir: Path) -> bool:
"""Verify that the answer file has the correct format."""
answer_file = test_dir / "answer.txt"
try:
content = answer_file.read_text().strip()
# Check if content is not empty
if not content:
print("❌ Answer file is empty")
return False
# Check if it contains only one line (no additional text)
if len(content.split('\n')) > 1:
print("❌ Answer file contains multiple lines or additional text")
return False
# Check if path contains the expected components
if 'models/backbone_module.py' not in content:
print("❌ Answer should contain 'models/backbone_module.py'")
return False
print("✅ Answer format is correct")
return True
except Exception as e:
print(f"❌ Error reading answer file: {e}")
return False
def verify_file_path_structure(test_dir: Path) -> bool:
"""Verify that the file path has the expected structure."""
answer_file = test_dir / "answer.txt"
try:
content = answer_file.read_text().strip()
# Expected path components for backbone module
expected_components = ["models", "backbone_module.py"]
# Check if all expected components are in the content
for component in expected_components:
if component not in content:
print(f"❌ Answer missing expected component: {component}")
return False
print("✅ Answer contains expected components")
return True
except Exception as e:
print(f"❌ Error verifying answer structure: {e}")
return False
def verify_file_exists(test_dir: Path) -> bool:
"""Verify that the identified file actually exists."""
answer_file = test_dir / "answer.txt"
try:
content = answer_file.read_text().strip()
# Try the expected path
file_path = test_dir / "models/backbone_module.py"
if not file_path.exists():
print(f"❌ Expected file does not exist: models/backbone_module.py")
return False
print("✅ Expected file exists")
return True
except Exception as e:
print(f"❌ Error verifying file existence: {e}")
return False
def verify_bug_fix(test_dir: Path) -> bool:
"""Verify that the bug has been fixed in the code."""
answer_file = test_dir / "answer.txt"
try:
content = answer_file.read_text().strip()
file_path = test_dir / "models/backbone_module.py"
if not file_path.exists():
print(f"❌ Cannot find file for bug fix verification: models/backbone_module.py")
return False
# Read the file and search for the specific line containing self.fp2 = PointnetFPModule
file_content = file_path.read_text()
lines = file_content.split('\n')
# Find the line containing self.fp2 = PointnetFPModule
target_line = None
target_line_number = None
for i, line in enumerate(lines):
if "self.fp2 = PointnetFPModule" in line:
target_line = line.strip()
target_line_number = i + 1 # Convert to 1-based line number
break
if target_line is None:
print("❌ Could not find line containing 'self.fp2 = PointnetFPModule'")
return False
# Check if the original buggy line still exists
original_bug = "self.fp2 = PointnetFPModule(mlp=[256,256,256])"
if original_bug in target_line:
print(f"❌ Bug has not been fixed - original line still exists at line {target_line_number}")
print(f" Line {target_line_number} content: {target_line}")
return False
# Check for the correct fix
correct_fixes = [
"self.fp2 = PointnetFPModule(mlp=[256+256,256,256])",
"self.fp2 = PointnetFPModule(mlp=[512,256,256])"
]
fix_found = False
for fix in correct_fixes:
if fix in target_line:
fix_found = True
break
if not fix_found:
print(f"❌ Bug fix not found at line {target_line_number}")
print(f" Line {target_line_number} content: {target_line}")
print(" Expected one of:")
for fix in correct_fixes:
print(f" - {fix}")
return False
print(f"✅ Bug has been fixed correctly at line {target_line_number}")
return True
except Exception as e:
print(f"❌ Error verifying bug fix: {e}")
return False
def main():
"""Main verification function."""
test_dir = get_test_directory()
print("🔍 Verifying VoteNet Task: Debug Backbone Module...")
# Define verification steps
verification_steps = [
("Answer File Exists", verify_answer_file_exists),
("Answer Format", verify_answer_format),
("Answer Structure", verify_file_path_structure),
("File Exists", verify_file_exists),
("Bug Fix Applied", verify_bug_fix),
]
# Run all verification steps
all_passed = True
for step_name, verify_func in verification_steps:
print(f"\n--- {step_name} ---")
if not verify_func(test_dir):
all_passed = False
# Final result
print("\n" + "="*50)
if all_passed:
print("✅ VoteNet backbone module bug has been correctly identified and fixed!")
print("🎉 Task verification: PASS")
sys.exit(0)
else:
print("❌ Task verification: FAIL")
sys.exit(1)
if __name__ == "__main__":
main()
================================================
FILE: tasks/filesystem/standard/votenet/requirements_writing/description.md
================================================
Please use FileSystem tools to finish the following task:
### Task Description
The VoteNet project is a 3D object detection framework for point clouds. Your task is to create a `requirements.txt` file that lists all the necessary Python dependencies for running this codebase.
### Task Objectives
1. **Create a requirements.txt file** in the main directory
2. **Include all essential dependencies** needed to run the VoteNet codebase
3. **Ensure the file format is correct** (one dependency per line)
4. **Save the file as `requirements.txt`** in the current working directory
5. **Not just** the packages installed via pip or conda: your answer should contain **every dependency necessary for the whole VoteNet process**.
### Requirements
The requirements.txt file should contain Python packages that are necessary for:
- 3D point cloud processing
- Deep learning frameworks
- Computer vision libraries
- Data visualization
- 3D mesh processing
- Network/graph operations
### Note
- You can examine the codebase structure and README to understand what packages are needed
- The file should be saved as `requirements.txt` in the current directory
- Each dependency should be on a separate line
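For illustration of the format only (the package names below are placeholders, not the expected dependency list):
```
numpy
scipy
torch
```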
================================================
FILE: tasks/filesystem/standard/votenet/requirements_writing/meta.json
================================================
{
"task_id": "requirements_writing",
"task_name": "Requirements Writing",
"category_id": "votenet",
"category_name": "VoteNet",
"description": "Generate a complete requirements.txt file containing all necessary Python dependencies for running the VoteNet codebase successfully.",
"author": "Lingjun Chen",
"created_at": "2025-08-13",
"difficulty": "L3",
"tags": [
"code exploration",
"cross-referencing"
],
"mcp": [
"filesystem"
],
"meta_data": {
"stateType": "text",
"stateContent": "votenet/\n ├── doc/\n │ ├── teaser.jpg\n │ └── tips.md\n ├── models/\n │ ├── ap_helper.py\n │ ├── backbone_module.py\n │ ├── boxnet.py\n │ ├── dump_helper.py\n │ ├── loss_helper.py\n │ ├── loss_helper_boxnet.py\n │ ├── proposal_module.py\n │ ├── votenet.py\n │ └── voting_module.py\n ├── pointnet2/\n │ ├── _ext_src/\n │ │ ├── include/\n │ │ │ ├── ball_query.h\n │ │ │ ├── cuda_utils.h\n │ │ │ ├── group_points.h\n │ │ │ ├── interpolate.h\n │ │ │ ├── sampling.h\n │ │ │ └── utils.h\n │ │ └── src/\n │ │ ├── ball_query.cpp\n │ │ ├── ball_query_gpu.cu\n │ │ ├── bindings.cpp\n │ │ ├── group_points.cpp\n │ │ ├── group_points_gpu.cu\n │ │ ├── interpolate.cpp\n │ │ ├── interpolate_gpu.cu\n │ │ ├── sampling.cpp\n │ │ └── sampling_gpu.cu\n │ ├── pointnet2_modules.py\n │ ├── pointnet2_test.py\n │ ├── pointnet2_utils.py\n │ ├── pytorch_utils.py\n │ └── setup.py\n ├── scannet/\n │ ├── meta_data/\n │ │ ├── scannet_means.npz\n │ │ ├── scannet_train.txt\n │ │ ├── scannetv2-labels.combined.tsv\n │ │ ├── scannetv2_test.txt\n │ │ ├── scannetv2_train.txt\n │ │ └── scannetv2_val.txt\n │ ├── scans/\n │ ├── batch_load_scannet_data.py\n │ ├── data_viz.py\n │ ├── load_scannet_data.py\n │ ├── model_util_scannet.py\n │ ├── README.md\n │ ├── scannet_detection_dataset.py\n │ └── scannet_utils.py\n ├── sunrgbd/\n │ ├── matlab/\n │ │ ├── extract_rgbd_data_v1.m\n │ │ ├── extract_rgbd_data_v2.m\n │ │ └── extract_split.m\n │ ├── OFFICIAL_SUNRGBD/\n │ ├── sunrgbd_trainval/\n │ ├── model_util_sunrgbd.py\n │ ├── README.md\n │ ├── sunrgbd_data.py\n │ ├── sunrgbd_detection_dataset.py\n │ └── sunrgbd_utils.py\n ├── utils/\n │ ├── box_util.py\n │ ├── eval_det.py\n │ ├── metric_util.py\n │ ├── nms.py\n │ ├── nn_distance.py\n │ ├── pc_util.py\n │ ├── tf_logger.py\n │ └── tf_visualizer.py\n ├── CODE_OF_CONDUCT.md\n ├── CONTRIBUTING.md\n ├── demo.py\n ├── eval.py\n ├── LICENSE\n ├── README.md\n └── train.py",
"stateUrl": "https://storage.mcpmark.ai/filesystem/votenet.zip",
"stateOriginalUrl": "https://github.com/facebookresearch/votenet"
}
}
================================================
FILE: tasks/filesystem/standard/votenet/requirements_writing/verify.py
================================================
#!/usr/bin/env python3
"""
Verification script for VoteNet Task: Create Requirements.txt File
"""
import sys
from pathlib import Path
import os
def get_test_directory() -> Path:
"""Get the test directory from FILESYSTEM_TEST_DIR env var."""
test_root = os.environ.get("FILESYSTEM_TEST_DIR")
if not test_root:
raise ValueError("FILESYSTEM_TEST_DIR environment variable is required")
return Path(test_root)
def verify_requirements_file_exists(test_dir: Path) -> bool:
"""Verify that the requirements.txt file exists."""
requirements_file = test_dir / "requirements.txt"
if not requirements_file.exists():
print("❌ File 'requirements.txt' not found")
return False
print("✅ Requirements.txt file found")
return True
def verify_requirements_file_readable(test_dir: Path) -> bool:
"""Verify that the requirements.txt file is readable."""
requirements_file = test_dir / "requirements.txt"
try:
content = requirements_file.read_text()
if not content.strip():
print("❌ Requirements.txt file is empty")
return False
print("✅ Requirements.txt file is readable")
return True
except Exception as e:
print(f"❌ Error reading requirements.txt file: {e}")
return False
def verify_required_dependencies_present(test_dir: Path) -> bool:
"""Verify that all required dependencies are present."""
requirements_file = test_dir / "requirements.txt"
try:
content = requirements_file.read_text()
# Required dependencies from answer.txt
required_deps = [
"matplotlib",
"opencv",
"plyfile",
"trimesh",
"pointnet2",
"networkx"
]
missing_deps = []
found_deps = []
for dep in required_deps:
if dep.lower() in content.lower():
found_deps.append(dep)
else:
missing_deps.append(dep)
if missing_deps:
print(f"❌ Missing required dependencies: {missing_deps}")
return False
print(f"✅ All required dependencies found: {found_deps}")
return True
except Exception as e:
print(f"❌ Error checking dependencies: {e}")
return False
def verify_file_format(test_dir: Path) -> bool:
"""Verify that the requirements.txt file has proper format."""
requirements_file = test_dir / "requirements.txt"
try:
content = requirements_file.read_text()
lines = content.split('\n')
# Check if file has content and proper line structure
if not content.strip():
print("❌ File is completely empty")
return False
# Check if there are multiple lines (indicating multiple dependencies)
non_empty_lines = [line.strip() for line in lines if line.strip()]
if len(non_empty_lines) < 3: # Should have at least 3 dependencies
print("❌ File seems to have too few dependencies")
return False
print("✅ File format is acceptable")
return True
except Exception as e:
print(f"❌ Error checking file format: {e}")
return False
def verify_no_duplicate_entries(test_dir: Path) -> bool:
"""Verify that there are no duplicate dependency entries."""
requirements_file = test_dir / "requirements.txt"
try:
content = requirements_file.read_text()
lines = [line.strip().lower() for line in content.split('\n') if line.strip()]
# Check for duplicates
if len(lines) != len(set(lines)):
print("❌ File contains duplicate entries")
return False
print("✅ No duplicate entries found")
return True
except Exception as e:
print(f"❌ Error checking for duplicates: {e}")
return False
def main():
"""Main verification function."""
test_dir = get_test_directory()
print("🔍 Verifying VoteNet Task: Create Requirements.txt File...")
# Define verification steps
verification_steps = [
("Requirements File Exists", verify_requirements_file_exists),
("File is Readable", verify_requirements_file_readable),
("Required Dependencies Present", verify_required_dependencies_present),
("File Format", verify_file_format),
("No Duplicate Entries", verify_no_duplicate_entries),
]
# Run all verification steps
all_passed = True
for step_name, verify_func in verification_steps:
print(f"\n--- {step_name} ---")
if not verify_func(test_dir):
all_passed = False
# Final result
print("\n" + "="*50)
if all_passed:
print("✅ Requirements.txt file successfully created with all required dependencies!")
print("🎉 Task verification: PASS")
sys.exit(0)
else:
print("❌ Task verification: FAIL")
sys.exit(1)
if __name__ == "__main__":
main()
================================================
FILE: tasks/github/easy/build-your-own-x/close_commented_issues/description.md
================================================
Use the GitHub MCP tools to close every issue in `mcpmark-eval/build-your-own-x` that already has at least one comment. Leave all other issues unchanged.
================================================
FILE: tasks/github/easy/build-your-own-x/close_commented_issues/meta.json
================================================
{
"task_id": "close_commented_issues",
"task_name": "Close Commented Issues",
"category_id": "build-your-own-x",
"category_name": "Build Your Own X (Easy)",
"description": "Use GitHub MCP tools to close every issue with comments in build-your-own-x and leave everything else alone.",
"author": "Zijian Wu",
"created_at": "2025-11-15",
"difficulty": "L1",
"tags": [
"issue management"
],
"mcp": [
"github"
],
"meta_data": {
"stateType": "url",
"stateContent": null,
"stateUrl": "https://github.com/mcpmark-source/build-your-own-x",
"stateOriginalUrl": "https://github.com/codecrafters-io/build-your-own-x"
}
}
================================================
FILE: tasks/github/easy/build-your-own-x/close_commented_issues/verify.py
================================================
import os
import sys
from typing import Optional
import requests
from dotenv import load_dotenv
REPO_NAME = "build-your-own-x"
TARGET_ISSUES = [23, 25]
def _fetch_issue(org: str, token: str, number: int) -> Optional[dict]:
url = f"https://api.github.com/repos/{org}/{REPO_NAME}/issues/{number}"
headers = {
"Authorization": f"Bearer {token}",
"Accept": "application/vnd.github+json",
}
try:
response = requests.get(url, headers=headers, timeout=30)
except Exception as exc:
print(f"Request error for issue #{number}: {exc}", file=sys.stderr)
return None
if response.status_code != 200:
print(
f"GitHub API returned {response.status_code} when fetching issue #{number}",
file=sys.stderr,
)
return None
try:
return response.json()
except Exception as exc:
print(f"Unable to parse issue #{number}: {exc}", file=sys.stderr)
return None
def verify() -> bool:
load_dotenv(".mcp_env")
token = os.environ.get("MCP_GITHUB_TOKEN")
org = os.environ.get("GITHUB_EVAL_ORG")
if not token:
print("MCP_GITHUB_TOKEN is missing", file=sys.stderr)
return False
if not org:
print("GITHUB_EVAL_ORG is missing", file=sys.stderr)
return False
print("Checking issue states in remote repository...")
success = True
for issue_number in TARGET_ISSUES:
data = _fetch_issue(org, token, issue_number)
if data is None:
success = False
continue
state = data.get("state", "").lower()
if state != "closed":
print(
f"Issue #{issue_number} is '{state}' but must be closed.",
file=sys.stderr,
)
success = False
else:
print(f"Issue #{issue_number} is closed as expected.")
return success
if __name__ == "__main__":
sys.exit(0 if verify() else 1)
================================================
FILE: tasks/github/easy/build-your-own-x/record_recent_commits/description.md
================================================
Use the GitHub MCP tools to work in the `mcpmark-eval/build-your-own-x` repository.
1. Retrieve the newest five commits on the default branch.
2. Open a new issue titled exactly `Latest 5 Commit Snapshot`.
3. Set the issue body to exactly this format (newest commit first):
```
Latest 5 commits (newest first)
1. <sha> | <author name> | <commit subject>
2. <sha> | <author name> | <commit subject>
3. <sha> | <author name> | <commit subject>
4. <sha> | <author name> | <commit subject>
5. <sha> | <author name> | <commit subject>
```
Use the full 40-character SHA for `<sha>` and only the first line of each commit message for `<commit subject>`. The `<author name>` must come from the commit metadata's author name field (not the GitHub username/login). Leave the issue open and do not touch other issues.
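For example, a single entry with purely hypothetical values would read:
```
1. 0123456789abcdef0123456789abcdef01234567 | Jane Doe | Update README
```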
================================================
FILE: tasks/github/easy/build-your-own-x/record_recent_commits/meta.json
================================================
{
"task_id": "record_recent_commits",
"task_name": "Record Recent Commits",
"category_id": "build-your-own-x",
"category_name": "Build Your Own X (Easy)",
"description": "Summarize the latest five commits by opening an issue with their SHAs, authors, and subjects.",
"author": "Zijian Wu",
"created_at": "2025-11-15",
"difficulty": "L1",
"tags": [
"commits",
"issue"
],
"mcp": [
"github"
],
"meta_data": {
"stateType": "url",
"stateContent": null,
"stateUrl": "https://github.com/mcpmark-source/build-your-own-x",
"stateOriginalUrl": "https://github.com/codecrafters-io/build-your-own-x"
}
}
================================================
FILE: tasks/github/easy/build-your-own-x/record_recent_commits/verify.py
================================================
import os
import sys
from typing import List, Optional
import requests
from dotenv import load_dotenv
REPO_NAME = "build-your-own-x"
BRANCH = "master"
ISSUE_TITLE = "Latest 5 Commit Snapshot"
EXPECTED_HEADER = "latest 5 commits (newest first)"
def _request(url: str, token: str) -> Optional[requests.Response]:
headers = {
"Authorization": f"Bearer {token}",
"Accept": "application/vnd.github+json",
}
try:
response = requests.get(url, headers=headers, timeout=30)
except Exception as exc: # pragma: no cover - network errors
print(f"Request error for {url}: {exc}", file=sys.stderr)
return None
if response.status_code != 200:
print(
f"GitHub API returned {response.status_code} for {url}",
file=sys.stderr,
)
return None
return response
def _fetch_commits(org: str, token: str) -> Optional[List[dict]]:
url = (
f"https://api.github.com/repos/{org}/{REPO_NAME}/commits"
f"?per_page=5&sha={BRANCH}"
)
response = _request(url, token)
if response is None:
return None
try:
return response.json()
except Exception as exc:
print(f"Unable to parse commits: {exc}", file=sys.stderr)
return None
def _find_issue(org: str, token: str) -> Optional[dict]:
page = 1
while True:
url = (
f"https://api.github.com/repos/{org}/{REPO_NAME}/issues"
f"?state=open&per_page=100&page={page}"
)
response = _request(url, token)
if response is None:
return None
try:
issues = response.json()
except Exception as exc:
print(f"Unable to parse issues: {exc}", file=sys.stderr)
return None
if not issues:
break
for issue in issues:
if issue.get("title") == ISSUE_TITLE:
# Exclude pull requests
if "pull_request" in issue:
continue
return issue
page += 1
print(
f"No open issue titled '{ISSUE_TITLE}' was found.",
file=sys.stderr,
)
return None
def verify() -> bool:
load_dotenv(".mcp_env")
token = os.environ.get("MCP_GITHUB_TOKEN")
org = os.environ.get("GITHUB_EVAL_ORG")
if not token:
print("MCP_GITHUB_TOKEN is missing", file=sys.stderr)
return False
if not org:
print("GITHUB_EVAL_ORG is missing", file=sys.stderr)
return False
commits = _fetch_commits(org, token)
if commits is None:
return False
if len(commits) < 5:
print("Less than five commits returned; cannot verify.", file=sys.stderr)
return False
issue = _find_issue(org, token)
if issue is None:
return False
if issue.get("title") != ISSUE_TITLE:
print(
f"Found issue title '{issue.get('title')}', expected '{ISSUE_TITLE}'.",
file=sys.stderr,
)
return False
if (issue.get("state") or "").lower() != "open":
print("Issue must remain open.", file=sys.stderr)
return False
body = issue.get("body") or ""
if not body.strip():
print("Issue body is empty.", file=sys.stderr)
return False
lines = [line.strip() for line in body.splitlines() if line.strip()]
if not lines:
print("Issue body contains no content.", file=sys.stderr)
return False
header = lines[0].lower()
if header != EXPECTED_HEADER:
print(
"Issue body must start with 'Latest 5 commits (newest first)'.",
file=sys.stderr,
)
return False
entries = lines[1:]
if len(entries) != 5:
print("Issue body must list exactly five commit entries.", file=sys.stderr)
return False
for idx in range(5):
commit = commits[idx]
sha = commit.get("sha", "")
subject = (commit.get("commit", {}).get("message", "").splitlines()[0]).strip()
author = commit.get("commit", {}).get("author", {}).get("name", "")
expected_line = f"{idx + 1}. {sha} | {author} | {subject}"
actual_line = entries[idx]
if actual_line != expected_line:
print(
f"Entry {idx + 1} mismatch.\nExpected: {expected_line}\nFound: {actual_line}",
file=sys.stderr,
)
return False
print("Issue contains the expected latest five commits.")
return True
if __name__ == "__main__":
sys.exit(0 if verify() else 1)
================================================
FILE: tasks/github/easy/claude-code/add_terminal_shortcuts_doc/description.md
================================================
Use the GitHub MCP tools to edit the `mcpmark-eval/claude-code` repository.
1. On the `main` branch, add a new file `docs/TERMINAL_SHORTCUTS.md` containing exactly:
```
# Terminal Shortcuts
- `claude plan`: Outline the next steps before making edits.
- `claude apply`: Run the plan and apply the queued changes.
- `claude check`: Re-run relevant tests or linters to validate the edits.
```
2. Commit with the message `docs: add terminal shortcuts reference` and push directly to `main`.
================================================
FILE: tasks/github/easy/claude-code/add_terminal_shortcuts_doc/meta.json
================================================
{
"task_id": "add_terminal_shortcuts_doc",
"task_name": "Add Terminal Shortcuts Doc",
"category_id": "claude-code",
"category_name": "Claude Code (Easy)",
"description": "Add a simple terminal shortcuts reference file to docs/TERMINAL_SHORTCUTS.md and push it to main.",
"author": "Zijian Wu",
"created_at": "2025-11-15",
"difficulty": "L1",
"tags": [
"docs update",
"content creation"
],
"mcp": [
"github"
],
"meta_data": {
"stateType": "url",
"stateContent": null,
"stateUrl": "https://github.com/mcpmark-source/claude-code",
"stateOriginalUrl": "https://github.com/anthropics/claude-code"
}
}
================================================
FILE: tasks/github/easy/claude-code/add_terminal_shortcuts_doc/verify.py
================================================
import base64
import os
import sys
from typing import Optional
import requests
from dotenv import load_dotenv
REPO_NAME = "claude-code"
TARGET_FILE = "docs/TERMINAL_SHORTCUTS.md"
BRANCH = "main"
EXPECTED_CONTENT = """# Terminal Shortcuts
- `claude plan`: Outline the next steps before making edits.
- `claude apply`: Run the plan and apply the queued changes.
- `claude check`: Re-run relevant tests or linters to validate the edits.
""".strip()
def _download_file(org: str, token: str) -> Optional[str]:
url = f"https://api.github.com/repos/{org}/{REPO_NAME}/contents/{TARGET_FILE}?ref={BRANCH}"
headers = {
"Authorization": f"Bearer {token}",
"Accept": "application/vnd.github+json",
}
try:
response = requests.get(url, headers=headers, timeout=30)
except Exception as exc:
print(f"Request error for {TARGET_FILE}: {exc}", file=sys.stderr)
return None
if response.status_code != 200:
print(
f"GitHub API returned {response.status_code} when fetching {TARGET_FILE}",
file=sys.stderr,
)
return None
data = response.json()
try:
content = base64.b64decode(data.get("content", "")).decode("utf-8").strip()
except Exception as exc:
print(f"Unable to decode {TARGET_FILE}: {exc}", file=sys.stderr)
return None
return content
def verify() -> bool:
load_dotenv(".mcp_env")
token = os.environ.get("MCP_GITHUB_TOKEN")
org = os.environ.get("GITHUB_EVAL_ORG")
if not token:
print("MCP_GITHUB_TOKEN is missing", file=sys.stderr)
return False
if not org:
print("GITHUB_EVAL_ORG is missing", file=sys.stderr)
return False
print(f"Checking {TARGET_FILE} in remote repository...")
content = _download_file(org, token)
if content is None:
return False
normalized = content.strip()
if normalized != EXPECTED_CONTENT:
print("TERMINAL_SHORTCUTS.md does not match the expected content.", file=sys.stderr)
print("Expected:")
print(EXPECTED_CONTENT)
print("Found:")
print(content)
return False
print("All checks passed! docs/TERMINAL_SHORTCUTS.md contains the expected text.")
return True
if __name__ == "__main__":
success = verify()
sys.exit(0 if success else 1)
================================================
FILE: tasks/github/easy/claude-code/thank_docker_pr_author/description.md
================================================
Use the GitHub MCP tools to comment on the pull request in `mcpmark-eval/claude-code` that proposes automating Docker image builds with GitHub Actions.
1. Skim the PR description so you understand it’s the Docker workflow automation proposal.
2. Add a new comment on that PR that thanks the author and contains all of these keywords: `Docker workflow`, `automation`, `review`.
================================================
FILE: tasks/github/easy/claude-code/thank_docker_pr_author/meta.json
================================================
{
"task_id": "thank_docker_pr_author",
"task_name": "Thank Docker PR Author",
"category_id": "claude-code",
"category_name": "Claude Code (Easy)",
"description": "Leave a thank-you comment on the Docker automation PR mentioning the workflow automation review keywords.",
"author": "Zijian Wu",
"created_at": "2025-11-15",
"difficulty": "L1",
"tags": [
"pull request",
"comment"
],
"mcp": [
"github"
],
"meta_data": {
"stateType": "url",
"stateContent": null,
"stateUrl": "https://github.com/mcpmark-source/claude-code",
"stateOriginalUrl": "https://github.com/anthropics/claude-code"
}
}
================================================
FILE: tasks/github/easy/claude-code/thank_docker_pr_author/verify.py
================================================
import os
import sys
from typing import Optional, Union
import requests
from dotenv import load_dotenv
REPO_NAME = "claude-code"
PR_NUMBER = 53
KEYWORDS = ["docker workflow", "automation", "review"]
def _github_get(org: str, token: str, path: str) -> Optional[Union[list, dict]]:
url = f"https://api.github.com/repos/{org}/{REPO_NAME}/{path}"
headers = {
"Authorization": f"Bearer {token}",
"Accept": "application/vnd.github+json",
}
try:
response = requests.get(url, headers=headers, timeout=30)
except Exception as exc:
print(f"Request error for {path}: {exc}", file=sys.stderr)
return None
if response.status_code != 200:
print(
f"GitHub API returned {response.status_code} for {path}",
file=sys.stderr,
)
return None
return response.json()
def verify() -> bool:
load_dotenv(".mcp_env")
token = os.environ.get("MCP_GITHUB_TOKEN")
org = os.environ.get("GITHUB_EVAL_ORG")
if not token:
print("MCP_GITHUB_TOKEN is missing", file=sys.stderr)
return False
if not org:
print("GITHUB_EVAL_ORG is missing", file=sys.stderr)
return False
comments = _github_get(org, token, f"issues/{PR_NUMBER}/comments?per_page=100")
if comments is None:
return False
for comment in comments:
body = comment.get("body", "").strip()
lowered = body.lower()
if not body:
continue
if not any(thank_word in lowered for thank_word in ("thanks", "thank you")):
continue
if all(keyword in lowered for keyword in KEYWORDS):
print("All checks passed! Keyword-rich thank-you comment found on PR #53.")
return True
print(
"Did not find a thank-you comment containing all required keywords on PR #53.",
file=sys.stderr,
)
return False
if __name__ == "__main__":
success = verify()
sys.exit(0 if success else 1)
================================================
FILE: tasks/github/easy/claude-code/triage_missing_tool_result_issue/description.md
================================================
Use the GitHub MCP tools to triage issue #24 in the `mcpmark-eval/claude-code` repository.
1. Read the issue details to understand the reported API error.
2. Add a triage comment on the issue that explicitly includes all of the following keywords: `invalid_request_error`, `toolu_01Kjp7i9iF3xJ3z9aH4pSaRw`, `tool_result`, `tool_use`. Use them while confirming the API error and asking for the missing result block.
3. Remove the `area:packaging` label from issue #24.
================================================
FILE: tasks/github/easy/claude-code/triage_missing_tool_result_issue/meta.json
================================================
{
"task_id": "triage_missing_tool_result_issue",
"task_name": "Triage Missing Tool Result Issue",
"category_id": "claude-code",
"category_name": "Claude Code (Easy)",
"description": "Leave a predefined triage comment on issue #24 and remove the area:packaging label.",
"author": "Zijian Wu",
"created_at": "2025-11-15",
"difficulty": "L1",
"tags": [
"issue triage",
"github"
],
"mcp": [
"github"
],
"meta_data": {
"stateType": "url",
"stateContent": null,
"stateUrl": "https://github.com/mcpmark-source/claude-code",
"stateOriginalUrl": "https://github.com/anthropics/claude-code"
}
}
================================================
FILE: tasks/github/easy/claude-code/triage_missing_tool_result_issue/verify.py
================================================
import os
import sys
from typing import Optional
import requests
from dotenv import load_dotenv
REPO_NAME = "claude-code"
ISSUE_NUMBER = 24
KEYWORDS = [
"invalid_request_error",
"toolu_01kjp7i9if3xj3z9ah4psarw",
"tool_result",
"tool_use",
]
REMOVED_LABEL = "area:packaging"
def _github_get(org: str, token: str, path: str) -> Optional[dict]:
url = f"https://api.github.com/repos/{org}/{REPO_NAME}/{path}"
headers = {
"Authorization": f"Bearer {token}",
"Accept": "application/vnd.github+json",
}
try:
response = requests.get(url, headers=headers, timeout=30)
except Exception as exc:
print(f"Request error for {path}: {exc}", file=sys.stderr)
return None
if response.status_code != 200:
print(
f"GitHub API returned {response.status_code} for {path}",
file=sys.stderr,
)
return None
return response.json()
def verify() -> bool:
load_dotenv(".mcp_env")
token = os.environ.get("MCP_GITHUB_TOKEN")
org = os.environ.get("GITHUB_EVAL_ORG")
if not token:
print("MCP_GITHUB_TOKEN is missing", file=sys.stderr)
return False
if not org:
print("GITHUB_EVAL_ORG is missing", file=sys.stderr)
return False
issue = _github_get(org, token, f"issues/{ISSUE_NUMBER}")
if issue is None:
return False
label_names = {label.get("name", "") for label in issue.get("labels", [])}
if REMOVED_LABEL in label_names:
print(f"Label '{REMOVED_LABEL}' is still present on issue #{ISSUE_NUMBER}.", file=sys.stderr)
return False
comments = _github_get(org, token, f"issues/{ISSUE_NUMBER}/comments?per_page=100")
if comments is None:
return False
found = False
for comment in comments:
body = comment.get("body", "").strip().lower()
if all(keyword in body for keyword in KEYWORDS):
found = True
break
if not found:
print(
"Did not find a triage comment containing all required keywords.",
file=sys.stderr,
)
return False
print("All checks passed! Comment added and label removed.")
return True
if __name__ == "__main__":
success = verify()
sys.exit(0 if success else 1)
================================================
FILE: tasks/github/easy/mcpmark-cicd/basic_ci_checks/description.md
================================================
Use the GitHub MCP tools to update the `mcpmark-eval/mcpmark-cicd` repository with a very small CI workflow.
## Goal
Add a GitHub Actions workflow named **Basic CI Checks** that automatically runs linting and unit tests any time work is pushed to or proposed for the `main` branch.
## Requirements
1. Create a branch called `basic-ci-checks` from `main`.
2. Add `.github/workflows/basic-ci.yml` with the following characteristics:
- Workflow name: `Basic CI Checks`.
- Trigger on both `push` and `pull_request`, limited to the `main` branch.
- Single job called `quality-checks` that runs on `ubuntu-latest` and uses Node.js 18 (`actions/setup-node`).
- Steps must include `actions/checkout`, `npm ci`, `npm run lint`, and `npm test` in that order after Node is configured.
3. Commit the workflow to your branch, open a pull request titled `Add basic CI checks`, and merge it so the workflow exists on `main`.
That's it—no caching, matrix builds, or issue automation required. Keep it lightweight and focused on verifying the existing lint/test scripts.
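As a reference only, a minimal sketch of a workflow that satisfies these requirements might look like the following (the `@v4` action versions are illustrative assumptions, not part of the requirements):
```
name: Basic CI Checks

on:
  push:
    branches: [main]
  pull_request:
    branches: [main]

jobs:
  quality-checks:
    runs-on: ubuntu-latest
    steps:
      # Check out the repository, then configure Node.js 18
      - uses: actions/checkout@v4
      - uses: actions/setup-node@v4
        with:
          node-version: 18
      # Install dependencies, then run lint and tests in order
      - run: npm ci
      - run: npm run lint
      - run: npm test
```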
================================================
FILE: tasks/github/easy/mcpmark-cicd/basic_ci_checks/meta.json
================================================
{
"task_id": "basic_ci_checks",
"task_name": "Basic CI Checks",
"category_id": "mcpmark-cicd",
"category_name": "MCPMark CI/CD (Easy)",
"description": "Add a lightweight GitHub Actions workflow that runs npm ci, npm run lint, and npm test whenever main is updated or receives a pull request.",
"author": "Zijian Wu",
"created_at": "2025-11-15",
"difficulty": "L1",
"tags": [
"ci/cd",
"github actions",
"workflow basics"
],
"mcp": [
"github"
],
"meta_data": {
"stateType": "url",
"stateContent": null,
"stateUrl": "https://github.com/mcpmark-source/mcpmark-cicd",
"stateOriginalUrl": null
}
}
================================================
FILE: tasks/github/easy/mcpmark-cicd/basic_ci_checks/verify.py
================================================
import base64
import os
import sys
from typing import List, Optional
import requests
from dotenv import load_dotenv
REPO_NAME = "mcpmark-cicd"
WORKFLOW_PATH = ".github/workflows/basic-ci.yml"
BRANCH = "main"
def _download_file(org: str, token: str, path: str) -> Optional[str]:
url = f"https://api.github.com/repos/{org}/{REPO_NAME}/contents/{path}?ref={BRANCH}"
headers = {
"Authorization": f"Bearer {token}",
"Accept": "application/vnd.github+json",
}
try:
response = requests.get(url, headers=headers, timeout=30)
except Exception as exc: # pragma: no cover - network failure
print(f"Request error for {path}: {exc}", file=sys.stderr)
return None
if response.status_code != 200:
print(
f"GitHub API returned {response.status_code} when fetching {path}",
file=sys.stderr,
)
return None
data = response.json()
try:
content = base64.b64decode(data.get("content", "")).decode("utf-8")
except Exception as exc:
print(f"Unable to decode {path}: {exc}", file=sys.stderr)
return None
return content
def _line_index(lines: List[str], needle: str) -> int:
for idx, line in enumerate(lines):
if needle in line:
return idx
return -1
def verify() -> bool:
load_dotenv(".mcp_env")
token = os.environ.get("MCP_GITHUB_TOKEN")
org = os.environ.get("GITHUB_EVAL_ORG")
if not token:
print("MCP_GITHUB_TOKEN is missing", file=sys.stderr)
return False
if not org:
print("GITHUB_EVAL_ORG is missing", file=sys.stderr)
return False
content = _download_file(org, token, WORKFLOW_PATH)
if content is None:
print(
"Workflow file .github/workflows/basic-ci.yml was not found on main",
file=sys.stderr,
)
return False
normalized = content.lower()
normalized_lines = [line.strip().lower() for line in content.splitlines()]
errors = []
required_snippets = {
"workflow name": "name: basic ci checks",
"job name": "quality-checks",
"checkout step": "actions/checkout",
"setup-node step": "actions/setup-node",
"node version": "node-version: 18",
"ubuntu runner": "runs-on: ubuntu-latest",
"push trigger": "push:",
"pull_request trigger": "pull_request:",
}
for label, snippet in required_snippets.items():
if snippet not in normalized:
errors.append(f"Missing {label} ({snippet}) in workflow")
branch_limited = "- main" in normalized or "[main]" in normalized
if not branch_limited:
errors.append("Workflow triggers must be limited to the main branch")
for command in ["npm ci", "npm run lint", "npm test"]:
if command not in normalized:
errors.append(f"Missing '{command}' step")
# Ensure npm commands happen in the expected order
ci_index = _line_index(normalized_lines, "npm ci")
lint_index = _line_index(normalized_lines, "npm run lint")
test_index = _line_index(normalized_lines, "npm test")
if ci_index == -1 or lint_index == -1 or test_index == -1:
errors.append("Could not find all npm commands to validate ordering")
else:
if not (ci_index < lint_index < test_index):
errors.append("npm commands must run in order: ci -> lint -> test")
if errors:
print("Verification failed:")
for err in errors:
print(f" - {err}", file=sys.stderr)
return False
print("✅ basic-ci workflow found with required steps and triggers")
return True
if __name__ == "__main__":
sys.exit(0 if verify() else 1)
================================================
FILE: tasks/github/easy/mcpmark-cicd/issue_lint_guard/description.md
================================================
Use the GitHub MCP tools to wire up a tiny issue-triggered lint check for `mcpmark-eval/mcpmark-cicd`.
## Goal
Whenever a maintainer opens the tracking issue **Lint workflow check**, the repo should automatically run `npm run lint` via GitHub Actions. Keep it simple—just prove the workflow fires for issue events.
## Requirements
1. Create a branch called `issue-lint-workflow` from `main`.
2. Add `.github/workflows/issue-lint.yml` with:
- Workflow name **Issue Lint Guard**.
- Trigger: `issues` with `types: [opened]` (no push/PR triggers).
- Single job `lint` on `ubuntu-latest` using Node.js 18 via `actions/setup-node`.
- Steps in order: `actions/checkout`, `npm ci`, `npm run lint`.
3. Open a pull request titled `Add issue lint workflow` and get it merged so the workflow exists on `main`.
4. After the merge, open a new issue titled **Lint workflow check** to trigger the workflow and wait until the matching run finishes successfully. Leave the issue open; we only care that the run went green.
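As a reference only, a minimal sketch of such a workflow might look like this (the `@v4` action versions are illustrative assumptions):
```
name: Issue Lint Guard

on:
  issues:
    types: [opened]

jobs:
  lint:
    runs-on: ubuntu-latest
    steps:
      # Check out the repository and configure Node.js 18
      - uses: actions/checkout@v4
      - uses: actions/setup-node@v4
        with:
          node-version: 18
      # Install dependencies, then run the lint script
      - run: npm ci
      - run: npm run lint
```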
================================================
FILE: tasks/github/easy/mcpmark-cicd/issue_lint_guard/meta.json
================================================
{
"task_id": "issue_lint_guard",
"task_name": "Issue Lint Guard",
"category_id": "mcpmark-cicd",
"category_name": "MCPMark CI/CD (Easy)",
"description": "Add an issue-triggered lint workflow and prove it runs when the tracking issue is opened.",
"author": "Zijian Wu",
"created_at": "2025-11-15",
"difficulty": "L1",
"tags": [
"ci/cd",
"github actions",
"issues"
],
"mcp": [
"github"
],
"meta_data": {
"stateType": "url",
"stateContent": null,
"stateUrl": "https://github.com/mcpmark-source/mcpmark-cicd",
"stateOriginalUrl": null
}
}
================================================
FILE: tasks/github/easy/mcpmark-cicd/issue_lint_guard/verify.py
================================================
import base64
import os
import sys
import time
from typing import List, Optional
import requests
from dotenv import load_dotenv
REPO_NAME = "mcpmark-cicd"
WORKFLOW_PATH = ".github/workflows/issue-lint.yml"
WORKFLOW_FILE = "issue-lint.yml"
TARGET_BRANCH = "main"
TRACKING_ISSUE_TITLE = "Lint workflow check"
MAX_POLL_ATTEMPTS = 12
POLL_INTERVAL_SECONDS = 10
def _download_file(org: str, token: str, path: str) -> Optional[str]:
url = f"https://api.github.com/repos/{org}/{REPO_NAME}/contents/{path}?ref={TARGET_BRANCH}"
headers = {
"Authorization": f"Bearer {token}",
"Accept": "application/vnd.github+json",
}
try:
response = requests.get(url, headers=headers, timeout=30)
except Exception as exc: # pragma: no cover - network error handling
print(f"Request error for {path}: {exc}", file=sys.stderr)
return None
if response.status_code != 200:
print(
f"GitHub API returned {response.status_code} when fetching {path}",
file=sys.stderr,
)
return None
data = response.json()
try:
content = base64.b64decode(data.get("content", "")).decode("utf-8")
except Exception as exc: # pragma: no cover - decode error
print(f"Unable to decode {path}: {exc}", file=sys.stderr)
return None
return content
def _line_index(lines: List[str], needle: str) -> int:
for idx, line in enumerate(lines):
if needle in line:
return idx
return -1
def _list_workflow_runs(org: str, token: str) -> Optional[List[dict]]:
url = (
f"https://api.github.com/repos/{org}/{REPO_NAME}/actions/workflows/{WORKFLOW_FILE}/runs"
f"?event=issues&per_page=15"
)
headers = {
"Authorization": f"Bearer {token}",
"Accept": "application/vnd.github+json",
}
try:
response = requests.get(url, headers=headers, timeout=30)
except Exception as exc: # pragma: no cover - network error handling
print(f"Request error when listing workflow runs: {exc}", file=sys.stderr)
return None
if response.status_code != 200:
print(
f"GitHub API returned {response.status_code} when listing workflow runs",
file=sys.stderr,
)
return None
data = response.json()
return data.get("workflow_runs", [])
def _wait_for_tracking_issue_run(org: str, token: str) -> bool:
for attempt in range(1, MAX_POLL_ATTEMPTS + 1):
runs = _list_workflow_runs(org, token)
if runs is None:
return False
relevant = [
run
for run in runs
if run.get("display_title") == TRACKING_ISSUE_TITLE
]
if not relevant:
print(
f"[{attempt}/{MAX_POLL_ATTEMPTS}] No Issue Lint Guard run for '{TRACKING_ISSUE_TITLE}' yet; waiting..."
)
time.sleep(POLL_INTERVAL_SECONDS)
continue
latest = relevant[0]
status = latest.get("status")
conclusion = latest.get("conclusion")
html_url = latest.get("html_url")
if status != "completed":
print(
f"[{attempt}/{MAX_POLL_ATTEMPTS}] Latest run is '{status}'; waiting for completion..."
)
time.sleep(POLL_INTERVAL_SECONDS)
continue
if conclusion != "success":
print(
"Latest Issue Lint Guard run finished without success.",
file=sys.stderr,
)
print(f"Status: {status}, Conclusion: {conclusion}", file=sys.stderr)
if html_url:
print(f"Run URL: {html_url}", file=sys.stderr)
return False
if html_url:
print(f"✅ Latest Issue Lint Guard run succeeded: {html_url}")
else:
print("✅ Latest Issue Lint Guard run succeeded")
return True
print(
f"Timed out waiting for a successful Issue Lint Guard run for '{TRACKING_ISSUE_TITLE}'",
file=sys.stderr,
)
return False
def verify() -> bool:
load_dotenv(".mcp_env")
token = os.environ.get("MCP_GITHUB_TOKEN")
org = os.environ.get("GITHUB_EVAL_ORG")
if not token:
print("MCP_GITHUB_TOKEN is missing", file=sys.stderr)
return False
if not org:
print("GITHUB_EVAL_ORG is missing", file=sys.stderr)
return False
content = _download_file(org, token, WORKFLOW_PATH)
if content is None:
print(
"Workflow file .github/workflows/issue-lint.yml was not found on main",
file=sys.stderr,
)
return False
normalized = content.lower()
normalized_lines = [line.strip().lower() for line in content.splitlines()]
errors = []
required_snippets = {
"workflow name": "name: issue lint guard",
"issues trigger": "issues:",
"types opened": "types:",
"job name": "lint:",
"runner": "runs-on: ubuntu-latest",
"checkout": "actions/checkout",
"setup-node": "actions/setup-node",
"node version": "node-version: 18",
"npm ci": "npm ci",
"npm run lint": "npm run lint",
}
for label, snippet in required_snippets.items():
if snippet not in normalized:
errors.append(f"Missing {label} ({snippet}) in workflow")
types_line = next(
(line for line in normalized_lines if "types" in line and "opened" in line),
None,
)
if types_line is None:
errors.append("issues trigger must limit types to include 'opened'")
checkout_idx = _line_index(normalized_lines, "actions/checkout")
setup_idx = _line_index(normalized_lines, "actions/setup-node")
ci_idx = _line_index(normalized_lines, "npm ci")
lint_idx = _line_index(normalized_lines, "npm run lint")
if -1 in [checkout_idx, setup_idx, ci_idx, lint_idx]:
errors.append("Could not determine workflow step ordering")
else:
if not (checkout_idx < setup_idx < ci_idx < lint_idx):
errors.append(
"Steps must run in order: checkout -> setup-node -> npm ci -> npm run lint"
)
if errors:
print("Workflow validation failed:")
for err in errors:
print(f" - {err}", file=sys.stderr)
return False
print("✅ issue-lint workflow file looks correct")
return _wait_for_tracking_issue_run(org, token)
if __name__ == "__main__":
sys.exit(0 if verify() else 1)
================================================
FILE: tasks/github/easy/mcpmark-cicd/nightly_health_check/description.md
================================================
Use the GitHub MCP tools to add a tiny bit of automation to `mcpmark-eval/mcpmark-cicd`.
Goal: every night the repo should run the existing health check script.
Do the usual branch/PR flow with a branch named `nightly-health` and a PR titled `Add nightly health check`.
Create `.github/workflows/nightly-health.yml` with:
- workflow name `Nightly Health Check`
- triggers: `workflow_dispatch` plus a cron schedule `0 2 * * *`
- one job called `health-check` on `ubuntu-latest`
- use Node.js 18 via `actions/setup-node`
- steps in order: checkout, npm ci, `npm run health-check`
Merge the PR so the workflow lives on `main`.
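As a reference only, a minimal sketch of such a workflow might look like this (the `@v4` action versions are illustrative assumptions):
```
name: Nightly Health Check

on:
  workflow_dispatch:
  schedule:
    - cron: '0 2 * * *'

jobs:
  health-check:
    runs-on: ubuntu-latest
    steps:
      # Check out the repository and configure Node.js 18
      - uses: actions/checkout@v4
      - uses: actions/setup-node@v4
        with:
          node-version: 18
      # Install dependencies, then run the existing health check script
      - run: npm ci
      - run: npm run health-check
```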
================================================
FILE: tasks/github/easy/mcpmark-cicd/nightly_health_check/meta.json
================================================
{
"task_id": "nightly_health_check",
"task_name": "Nightly Health Check",
"category_id": "mcpmark-cicd",
"category_name": "MCPMark CI/CD (Easy)",
"description": "Add a scheduled workflow that runs the npm health check script every night.",
"author": "Zijian Wu",
"created_at": "2025-11-15",
"difficulty": "L1",
"tags": [
"ci/cd",
"github actions",
"scheduling"
],
"mcp": [
"github"
],
"meta_data": {
"stateType": "url",
"stateContent": null,
"stateUrl": "https://github.com/mcpmark-source/mcpmark-cicd",
"stateOriginalUrl": null
}
}
================================================
FILE: tasks/github/easy/mcpmark-cicd/nightly_health_check/verify.py
================================================
import base64
import os
import sys
from typing import List, Optional
import requests
from dotenv import load_dotenv
REPO_NAME = "mcpmark-cicd"
WORKFLOW_PATH = ".github/workflows/nightly-health.yml"
BRANCH = "main"
def _download_file(org: str, token: str, path: str) -> Optional[str]:
url = f"https://api.github.com/repos/{org}/{REPO_NAME}/contents/{path}?ref={BRANCH}"
headers = {
"Authorization": f"Bearer {token}",
"Accept": "application/vnd.github+json",
}
try:
response = requests.get(url, headers=headers, timeout=30)
except Exception as exc: # pragma: no cover
print(f"Request error for {path}: {exc}", file=sys.stderr)
return None
if response.status_code != 200:
print(
f"GitHub API returned {response.status_code} when fetching {path}",
file=sys.stderr,
)
return None
data = response.json()
try:
content = base64.b64decode(data.get("content", "")).decode("utf-8")
except Exception as exc:
print(f"Unable to decode {path}: {exc}", file=sys.stderr)
return None
return content
def _line_index(lines: List[str], needle: str) -> int:
for idx, line in enumerate(lines):
if needle in line:
return idx
return -1
def verify() -> bool:
load_dotenv(".mcp_env")
token = os.environ.get("MCP_GITHUB_TOKEN")
org = os.environ.get("GITHUB_EVAL_ORG")
if not token:
print("MCP_GITHUB_TOKEN is missing", file=sys.stderr)
return False
if not org:
print("GITHUB_EVAL_ORG is missing", file=sys.stderr)
return False
content = _download_file(org, token, WORKFLOW_PATH)
if content is None:
print(
"Workflow file .github/workflows/nightly-health.yml was not found on main",
file=sys.stderr,
)
return False
normalized = content.lower()
normalized_lines = [line.strip().lower() for line in content.splitlines()]
errors = []
required_bits = {
"workflow name": "name: nightly health check",
"workflow_dispatch trigger": "workflow_dispatch:",
"schedule": "schedule:",
"cron": "0 2 * * *",
"job name": "health-check:",
"runner": "runs-on: ubuntu-latest",
"checkout": "actions/checkout",
"setup-node": "actions/setup-node",
"node version": "node-version: 18",
"npm ci": "npm ci",
"health script": "npm run health-check",
}
for label, snippet in required_bits.items():
if snippet not in normalized:
errors.append(f"Missing {label} ({snippet}) in workflow")
schedule_index = _line_index(normalized_lines, "schedule:")
cron_index = _line_index(normalized_lines, "- cron: '0 2 * * *'")
if cron_index == -1:
cron_index = _line_index(normalized_lines, "cron: '0 2 * * *'")
if cron_index == -1:
cron_index = _line_index(normalized_lines, 'cron: "0 2 * * *"')
if schedule_index == -1 or cron_index == -1 or cron_index < schedule_index:
errors.append("Cron expression must appear under schedule trigger")
ci_index = _line_index(normalized_lines, "npm ci")
health_index = _line_index(normalized_lines, "npm run health-check")
if ci_index == -1 or health_index == -1:
errors.append("npm ci and npm run health-check must both appear")
else:
if not ci_index < health_index:
errors.append("npm ci must run before npm run health-check")
if errors:
print("Verification failed:")
for err in errors:
print(f" - {err}", file=sys.stderr)
return False
print("✅ nightly-health workflow found with required schedule and steps")
return True
if __name__ == "__main__":
sys.exit(0 if verify() else 1)
================================================
FILE: tasks/github/easy/missing-semester/count_translations/description.md
================================================
Use the GitHub MCP tools to inspect the `mcpmark-eval/missing-semester` repository.
1. Navigate the repository to find the list of community translations that appears on the site's home page.
2. Determine how many translation links are currently listed.
3. Record both the count and the specific file you used as evidence by creating an `ANSWER.md` file in the repository root that contains exactly:
```
Translation Count:
Source:
```
4. Commit the new file and push the change to `master`.
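As a rough sketch of the counting step, one might pull the home-page source via the contents API and count the links in its translations list (the file path `index.md` and the link regex below are assumptions to confirm while exploring):
```python
import base64
import os
import re

import requests

ORG = os.environ["GITHUB_EVAL_ORG"]
TOKEN = os.environ["MCP_GITHUB_TOKEN"]
HEADERS = {"Authorization": f"Bearer {TOKEN}", "Accept": "application/vnd.github+json"}

# Fetch the home-page source file (path is an assumption).
url = f"https://api.github.com/repos/{ORG}/missing-semester/contents/index.md?ref=master"
text = base64.b64decode(requests.get(url, headers=HEADERS, timeout=30).json()["content"]).decode("utf-8")

# Count markdown list-item links (pattern is illustrative; restrict it to the
# translations section before trusting the number).
links = re.findall(r"^\s*[-*]\s*\[[^\]]+\]\([^)]+\)", text, flags=re.MULTILINE)
print(f"{len(links)} list-item links found")
```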
================================================
FILE: tasks/github/easy/missing-semester/count_translations/meta.json
================================================
{
"task_id": "count_translations",
"task_name": "Count Translations",
"category_id": "missing-semester",
"category_name": "Missing Semester (Easy)",
"description": "Use GitHub MCP to count the translations listed on the home page, record the value in ANSWER.md, and push the change to master.",
"author": "Zijian Wu",
"created_at": "2025-11-15",
"difficulty": "L1",
"tags": [
"content search",
"answer file"
],
"mcp": [
"github"
],
"meta_data": {
"stateType": "url",
"stateContent": null,
"stateUrl": "https://github.com/mcpmark-source/missing-semester",
"stateOriginalUrl": "https://github.com/missing-semester/missing-semester"
}
}
================================================
FILE: tasks/github/easy/missing-semester/count_translations/verify.py
================================================
import base64
import os
import sys
from typing import Optional
import requests
from dotenv import load_dotenv
REPO_NAME = "missing-semester"
TARGET_FILE = "ANSWER.md"
BRANCH = "master"
EXPECTED_COUNT = "translation count: 14"
EXPECTED_SOURCE = "source: index.md"
def _download_file(org: str, token: str, path: str) -> Optional[str]:
url = f"https://api.github.com/repos/{org}/{REPO_NAME}/contents/{path}?ref={BRANCH}"
headers = {
"Authorization": f"Bearer {token}",
"Accept": "application/vnd.github+json",
}
try:
response = requests.get(url, headers=headers, timeout=30)
except Exception as exc:
print(f"Request error for {path}: {exc}", file=sys.stderr)
return None
if response.status_code != 200:
print(
f"GitHub API returned {response.status_code} when fetching {path}",
file=sys.stderr,
)
return None
data = response.json()
try:
content = base64.b64decode(data.get("content", "")).decode("utf-8").strip()
except Exception as exc:
print(f"Unable to decode {path}: {exc}", file=sys.stderr)
return None
return content
def verify() -> bool:
load_dotenv(".mcp_env")
token = os.environ.get("MCP_GITHUB_TOKEN")
org = os.environ.get("GITHUB_EVAL_ORG")
if not token:
print("MCP_GITHUB_TOKEN is missing", file=sys.stderr)
return False
if not org:
print("GITHUB_EVAL_ORG is missing", file=sys.stderr)
return False
print("Checking ANSWER.md in remote repository...")
answer_content = _download_file(org, token, TARGET_FILE)
if answer_content is None:
return False
normalized = " ".join(answer_content.lower().split())
if EXPECTED_COUNT not in normalized:
print(
"ANSWER.md must include 'Translation Count: 14' (spacing/casing ignored).",
file=sys.stderr,
)
print("Found:")
print(answer_content)
return False
if EXPECTED_SOURCE not in normalized:
print(
"ANSWER.md must include 'Source: index.md' (spacing/casing ignored).",
file=sys.stderr,
)
print("Found:")
print(answer_content)
return False
print("All checks passed! ANSWER.md contains the expected count and source.")
return True
if __name__ == "__main__":
success = verify()
sys.exit(0 if success else 1)
================================================
FILE: tasks/github/easy/missing-semester/find_ga_tracking_id/description.md
================================================
Use the GitHub MCP tools to inspect the `mcpmark-eval/missing-semester` repository.
1. Determine the Analytics tracking ID that the Missing Semester site declares in its configuration.
2. Create an `ANSWER.md` file in the repository root that contains exactly:
```
Analytics Tracking ID:
```
3. Commit the new file and push the change to `master`.
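One hedged way to locate the setting is the code search API scoped to this repository; the query term below assumes a Jekyll-style configuration and may need adjusting:
```python
import os

import requests

ORG = os.environ["GITHUB_EVAL_ORG"]
TOKEN = os.environ["MCP_GITHUB_TOKEN"]
HEADERS = {"Authorization": f"Bearer {TOKEN}", "Accept": "application/vnd.github+json"}

# Search the repo for the analytics setting (query term is an assumption).
resp = requests.get(
    "https://api.github.com/search/code",
    headers=HEADERS,
    params={"q": f"google_analytics repo:{ORG}/missing-semester"},
    timeout=30,
)
for item in resp.json().get("items", []):
    print(item["path"])  # candidate config files to open and read the ID from
```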
================================================
FILE: tasks/github/easy/missing-semester/find_ga_tracking_id/meta.json
================================================
{
"task_id": "find_ga_tracking_id",
"task_name": "Find GA Tracking ID",
"category_id": "missing-semester",
"category_name": "Missing Semester (Easy)",
"description": "Use GitHub MCP to discover the single Google Analytics tracking ID declared in the site configuration, write it to ANSWER.md, and push the change to master.",
"author": "Zijian Wu",
"created_at": "2025-11-15",
"difficulty": "L1",
"tags": [
"config search",
"analytics",
"answer file"
],
"mcp": [
"github"
],
"meta_data": {
"stateType": "url",
"stateContent": null,
"stateUrl": "https://github.com/mcpmark-source/missing-semester",
"stateOriginalUrl": "https://github.com/missing-semester/missing-semester"
}
}
================================================
FILE: tasks/github/easy/missing-semester/find_ga_tracking_id/verify.py
================================================
import base64
import os
import sys
from typing import Optional
import requests
from dotenv import load_dotenv
# Accept either wording, regardless of casing
EXPECTED_VARIANTS = {
"google analytics tracking id: g-p7wvhd84d1",
"analytics tracking id: g-p7wvhd84d1",
}
REPO_NAME = "missing-semester"
TARGET_FILE = "ANSWER.md"
BRANCH = "master"
def _download_file(org: str, token: str) -> Optional[str]:
url = f"https://api.github.com/repos/{org}/{REPO_NAME}/contents/{TARGET_FILE}?ref={BRANCH}"
headers = {
"Authorization": f"Bearer {token}",
"Accept": "application/vnd.github+json",
}
try:
response = requests.get(url, headers=headers, timeout=30)
except Exception as exc:
print(f"Request error for {TARGET_FILE}: {exc}", file=sys.stderr)
return None
if response.status_code != 200:
print(
f"GitHub API returned {response.status_code} when fetching {TARGET_FILE}",
file=sys.stderr,
)
return None
data = response.json()
try:
content = base64.b64decode(data.get("content", "")).decode("utf-8").strip()
except Exception as exc:
print(f"Unable to decode {TARGET_FILE}: {exc}", file=sys.stderr)
return None
return content
def verify() -> bool:
load_dotenv(".mcp_env")
token = os.environ.get("MCP_GITHUB_TOKEN")
org = os.environ.get("GITHUB_EVAL_ORG")
if not token:
print("MCP_GITHUB_TOKEN is missing", file=sys.stderr)
return False
if not org:
print("GITHUB_EVAL_ORG is missing", file=sys.stderr)
return False
print("Checking ANSWER.md in remote repository...")
answer_content = _download_file(org, token)
if answer_content is None:
return False
normalized = answer_content.strip().lower()
if normalized not in EXPECTED_VARIANTS:
print("ANSWER.md does not contain an accepted tracking ID format", file=sys.stderr)
print("Accepted variants:", file=sys.stderr)
for variant in EXPECTED_VARIANTS:
print(f" - {variant}", file=sys.stderr)
print(f"Found: {answer_content}", file=sys.stderr)
return False
print("All checks passed! ANSWER.md matches an accepted content variant.")
return True
if __name__ == "__main__":
success = verify()
sys.exit(0 if success else 1)
================================================
FILE: tasks/github/standard/build_your_own_x/find_commit_date/description.md
================================================
Find out when Daniel Stefanovic first added the entries in the Voxel Engine section. Then create an ANSWER.md file in the repository whose content is that date in YYYY-MM-DD format (e.g., 2000-06-02).
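A minimal sketch of the history lookup, filtering the commits API by file path and scanning messages for the section name (the `README.md` path and the keyword filter are assumptions):
```python
import os

import requests

ORG = os.environ["GITHUB_EVAL_ORG"]
TOKEN = os.environ["MCP_GITHUB_TOKEN"]
HEADERS = {"Authorization": f"Bearer {TOKEN}", "Accept": "application/vnd.github+json"}
API = f"https://api.github.com/repos/{ORG}/build-your-own-x"

# List commits that touched the main list file (newest first; paginate for older history).
commits = requests.get(f"{API}/commits", headers=HEADERS, timeout=30,
                       params={"path": "README.md", "per_page": 100}).json()
for c in commits:
    if "voxel" in c["commit"]["message"].lower():
        author = c["commit"]["author"]
        print(author["date"][:10], author["name"], "-", c["commit"]["message"].splitlines()[0])
```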
================================================
FILE: tasks/github/standard/build_your_own_x/find_commit_date/meta.json
================================================
{
"task_id": "find_commit_date",
"task_name": "Find Commit Date",
"category_id": "build_your_own_x",
"category_name": "Build Your Own X",
"description": "Find when Voxel Engine entries were first created by Daniel Stefanovic and document the date.",
"author": "Xiangyan Liu",
"created_at": "2025-08-15",
"difficulty": "L3",
"tags": [
"repository analysis"
],
"mcp": [
"github"
],
"meta_data": {
"stateType": "url",
"stateContent": null,
"stateUrl": "https://github.com/mcpmark-source/build-your-own-x",
"stateOriginalUrl": "https://github.com/codecrafters-io/build-your-own-x"
}
}
================================================
FILE: tasks/github/standard/build_your_own_x/find_commit_date/verify.py
================================================
import sys
import os
import requests
from typing import Dict, Optional, Tuple
import base64
from dotenv import load_dotenv
def _get_github_api(
endpoint: str, headers: Dict[str, str], org: str, repo: str = "build-your-own-x"
) -> Tuple[bool, Optional[Dict]]:
"""Make a GET request to GitHub API and return (success, response)."""
url = f"https://api.github.com/repos/{org}/{repo}/{endpoint}"
try:
response = requests.get(url, headers=headers)
if response.status_code == 200:
return True, response.json()
elif response.status_code == 404:
return False, None
else:
print(f"API error for {endpoint}: {response.status_code}", file=sys.stderr)
return False, None
except Exception as e:
print(f"Exception for {endpoint}: {e}", file=sys.stderr)
return False, None
def _get_file_content(
file_path: str,
headers: Dict[str, str],
org: str,
repo: str = "build-your-own-x",
ref: str = "master",
) -> Optional[str]:
"""Get the content of a file from the repository."""
success, result = _get_github_api(
f"contents/{file_path}?ref={ref}", headers, org, repo
)
if not success or not result:
return None
try:
content = base64.b64decode(result.get("content", "")).decode("utf-8")
return content
except Exception as e:
print(f"Content decode error for {file_path}: {e}", file=sys.stderr)
return None
def verify_task() -> bool:
"""Verify the find commit data task for Voxel Engine entries."""
# Load environment variables from .mcp_env
load_dotenv(".mcp_env")
# Get GitHub token and org
github_token = os.environ.get("MCP_GITHUB_TOKEN")
github_org = os.environ.get("GITHUB_EVAL_ORG")
if not github_token:
print("Error: MCP_GITHUB_TOKEN environment variable not set", file=sys.stderr)
return False
if not github_org:
print("Error: GITHUB_EVAL_ORG environment variable not set", file=sys.stderr)
return False
headers = {
"Authorization": f"Bearer {github_token}",
"Accept": "application/vnd.github.v3+json",
}
print("Verifying Voxel Engine commit date task...")
# 1. Check if ANSWER.md exists in the repository
print("1. Checking if ANSWER.md exists...")
content = _get_file_content("ANSWER.md", headers, github_org)
if not content:
print("Error: ANSWER.md not found in repository", file=sys.stderr)
return False
print("✓ ANSWER.md found")
# 2. Check the content format
print("2. Checking content format...")
content = content.strip()
# The expected date when Daniel Stefanovic added Voxel Engine entries
# Based on historical records, this should be 2018-07-07
expected_date = "2018-07-07"
# Check if the content matches the expected date format (YYYY-MM-DD)
import re
date_pattern = r'^\d{4}-\d{2}-\d{2}$'
if not re.match(date_pattern, content):
print(f"Error: Invalid date format. Expected YYYY-MM-DD, got: {content}", file=sys.stderr)
return False
print("✓ Date format is correct")
# 3. Verify the date is correct
print("3. Verifying the date...")
if content != expected_date:
print(f"Error: Incorrect date. Expected {expected_date}, got: {content}", file=sys.stderr)
return False
print(f"✓ Date is correct: {content}")
# 4. Verify README.md contains Voxel Engine section
print("4. Checking if README.md contains Voxel Engine section...")
readme_content = _get_file_content("README.md", headers, github_org)
if not readme_content:
print("Error: README.md not found in repository", file=sys.stderr)
return False
if "Voxel Engine" not in readme_content:
print("Error: Voxel Engine section not found in README.md", file=sys.stderr)
return False
# Check for specific Voxel Engine entries
voxel_entries = [
"Let's Make a Voxel Engine",
"Java Voxel Engine Tutorial"
]
for entry in voxel_entries:
if entry not in readme_content:
print(f"Warning: Voxel Engine entry '{entry}' not found in README.md", file=sys.stderr)
print("✓ Voxel Engine section found in README.md")
print("\n✅ All verification checks passed!")
print("Task completed successfully:")
print(f" - ANSWER.md created with date: {content}")
print(" - Date format is correct (YYYY-MM-DD)")
print(" - Date matches expected creation date for Voxel Engine entries by Daniel Stefanovic")
print(" - Voxel Engine section exists in README.md")
return True
if __name__ == "__main__":
success = verify_task()
sys.exit(0 if success else 1)
================================================
FILE: tasks/github/standard/build_your_own_x/find_rag_commit/description.md
================================================
Find the SHA of the commit that added an entry about "RAG for Document Search". Then create an ANSWER.md file in the repository whose content is that commit SHA (e.g., 023dfa35694db2709057488ad338afdbc89fb226).
Hint: It should be in an "AI model" section I think.
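A hedged sketch of one way to hunt for the commit via the commit search API (the quoted query phrase is an assumption; falling back to walking the README history also works):
```python
import os

import requests

ORG = os.environ["GITHUB_EVAL_ORG"]
TOKEN = os.environ["MCP_GITHUB_TOKEN"]
HEADERS = {"Authorization": f"Bearer {TOKEN}", "Accept": "application/vnd.github+json"}

# Search commit messages for the entry name; if nothing matches, walk the
# file history instead, since the message may not mention the entry verbatim.
resp = requests.get(
    "https://api.github.com/search/commits",
    headers=HEADERS,
    params={"q": f'repo:{ORG}/build-your-own-x "RAG for Document Search"'},
    timeout=30,
)
for item in resp.json().get("items", []):
    print(item["sha"], "-", item["commit"]["message"].splitlines()[0])
```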
================================================
FILE: tasks/github/standard/build_your_own_x/find_rag_commit/meta.json
================================================
{
"task_id": "find_rag_commit",
"task_name": "Find Rag Commit",
"category_id": "build_your_own_x",
"category_name": "Build Your Own X",
"description": "Identify the specific commit SHA that added the RAG for Document Search entry to the repository.",
"author": "Xiangyan Liu",
"created_at": "2025-08-15",
"difficulty": "L3",
"tags": [
"repository analysis"
],
"mcp": [
"github"
],
"meta_data": {
"stateType": "url",
"stateContent": null,
"stateUrl": "https://github.com/mcpmark-source/build-your-own-x",
"stateOriginalUrl": "https://github.com/codecrafters-io/build-your-own-x"
}
}
================================================
FILE: tasks/github/standard/build_your_own_x/find_rag_commit/verify.py
================================================
import sys
import os
import requests
from typing import Dict, Optional, Tuple
import base64
from dotenv import load_dotenv
def _get_github_api(
endpoint: str, headers: Dict[str, str], org: str, repo: str = "build-your-own-x"
) -> Tuple[bool, Optional[Dict]]:
"""Make a GET request to GitHub API and return (success, response)."""
url = f"https://api.github.com/repos/{org}/{repo}/{endpoint}"
try:
response = requests.get(url, headers=headers)
if response.status_code == 200:
return True, response.json()
elif response.status_code == 404:
return False, None
else:
print(f"API error for {endpoint}: {response.status_code}", file=sys.stderr)
return False, None
except Exception as e:
print(f"Exception for {endpoint}: {e}", file=sys.stderr)
return False, None
def _get_file_content(
file_path: str,
headers: Dict[str, str],
org: str,
repo: str = "build-your-own-x",
ref: str = "master",
) -> Optional[str]:
"""Get the content of a file from the repository."""
success, result = _get_github_api(
f"contents/{file_path}?ref={ref}", headers, org, repo
)
if not success or not result:
return None
try:
content = base64.b64decode(result.get("content", "")).decode("utf-8")
return content
except Exception as e:
print(f"Content decode error for {file_path}: {e}", file=sys.stderr)
return None
def verify_task() -> bool:
"""Verify the find RAG commit SHA task."""
# Load environment variables from .mcp_env
load_dotenv(".mcp_env")
# Get GitHub token and org
github_token = os.environ.get("MCP_GITHUB_TOKEN")
github_org = os.environ.get("GITHUB_EVAL_ORG")
if not github_token:
print("Error: MCP_GITHUB_TOKEN environment variable not set", file=sys.stderr)
return False
if not github_org:
print("Error: GITHUB_EVAL_ORG environment variable not set", file=sys.stderr)
return False
headers = {
"Authorization": f"Bearer {github_token}",
"Accept": "application/vnd.github.v3+json",
}
print("Verifying RAG commit SHA task...")
# Expected commit SHA for RAG for Document Search
expected_sha = "048cd3b3de70e4b429057891576ea394a50cdf48"
# 1. Check if ANSWER.md exists in the repository
print("1. Checking if ANSWER.md exists...")
content = _get_file_content("ANSWER.md", headers, github_org)
if not content:
print("Error: ANSWER.md not found in repository", file=sys.stderr)
return False
print("✓ ANSWER.md found")
# 2. Check the content matches expected SHA
print("2. Checking commit SHA...")
content = content.strip()
if content != expected_sha:
print(f"Error: Incorrect commit SHA. Expected {expected_sha}, got: {content}", file=sys.stderr)
return False
print("✓ Commit SHA is correct")
# 3. Verify the commit exists
print("3. Verifying the commit exists...")
success, commit_data = _get_github_api(f"commits/{content}", headers, github_org)
if not success or not commit_data:
print(f"Error: Commit {content} not found in repository", file=sys.stderr)
return False
print(f"✓ Commit {content} exists")
print("\n✅ All verification checks passed!")
print("Task completed successfully:")
print(f" - ANSWER.md created with correct commit SHA: {content}")
print(f" - Commit exists in the repository")
print(f" - Commit message: {commit_data.get('commit', {}).get('message', '')}")
return True
if __name__ == "__main__":
success = verify_task()
sys.exit(0 if success else 1)
================================================
FILE: tasks/github/standard/claude-code/automated_changelog_generation/description.md
================================================
I need you to analyze all recently closed issues and open pull requests in the repository, then generate comprehensive documentation and organize them properly.
**Step 1: Create Documentation Branch**
Create a new branch called 'docs/changelog-and-migration' from the main branch.
**Step 2: Generate Changelog from Closed Issues**
Find all closed issues in the repository and create the file `CHANGELOG-GENERATED.md` on your branch with:
- A heading "# Changelog - Recent Fixes"
- A "### 🐛 Bug Fixes" section listing all closed issues with bug label, formatted as: "- **#[NUMBER]**: [Title] ([labels])"
- A "### 📚 Documentation" section for closed issues with documentation label
- A "### 🔄 Duplicates" section for issues marked as duplicate
- A "### 📊 Statistics" section with:
- Total number of closed issues
- Distribution by platform labels (platform:macos, platform:linux, etc.)
- Distribution by area labels (area:core, area:tools, etc.)
**Step 3: Create Migration Guide for Open PRs**
Analyze all open pull requests and create the file `docs/MIGRATION_GUIDE.md` with:
- A heading "# Migration Guide for Pending Features"
- For each open PR, create a section with:
- PR title and number
- Summary of changes based on the PR description
- Any new configuration or environment variables mentioned
- Installation or usage instructions if applicable
**Step 4: Create Issue Analysis Report**
Create the file `reports/ISSUE_ANALYSIS.md` with:
- A heading "# Issue Analysis Report"
- A "## Closed Issues by Category" section grouping closed issues by their primary label
- A "## Resolution Patterns" section identifying common themes
- A "## Platform Impact Analysis" section showing which platforms were most affected
- Include references to specific issues that had cross-project impact or memory-related problems
**Step 5: Create PR Integration Plan**
Create the file `reports/PR_INTEGRATION_PLAN.md` with:
- A heading "# Pull Request Integration Strategy"
- A "## Open PRs Overview" section listing each open PR with a technical summary
- A "## Dependencies and Conflicts" section analyzing potential conflicts between PRs
- A "## Recommended Merge Order" section with reasoning
- A "## Risk Assessment" section linking any risks to previously closed issues
**Step 6: Create Documentation PR**
Create a pull request from 'docs/changelog-and-migration' to 'main' with:
- Title: "docs: Generated changelog and migration documentation"
- Body including:
- A "## Summary" section describing what was generated
- A "## Files Created" section listing all new documentation
- A "## Issues Processed" section mentioning the number of closed issues analyzed
- A "## PRs Analyzed" section mentioning the open PRs reviewed
**Step 7: Merge Documentation PR**
Merge the documentation pull request using the "squash" merge method.
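For the changelog step, a minimal sketch of gathering closed issues and bucketing them by label; this is a rough outline that assumes one page of results covers the repository, not the full report generation:
```python
import os
from collections import defaultdict

import requests

ORG = os.environ["GITHUB_EVAL_ORG"]
TOKEN = os.environ["MCP_GITHUB_TOKEN"]
HEADERS = {"Authorization": f"Bearer {TOKEN}", "Accept": "application/vnd.github+json"}
API = f"https://api.github.com/repos/{ORG}/claude-code"

buckets = defaultdict(list)
issues = requests.get(f"{API}/issues", headers=HEADERS, timeout=30,
                      params={"state": "closed", "per_page": 100}).json()
for issue in issues:
    if "pull_request" in issue:  # the issues endpoint also returns PRs; skip them
        continue
    labels = [lbl["name"] for lbl in issue["labels"]]
    line = f"- **#{issue['number']}**: {issue['title']} ({', '.join(labels)})"
    for name in labels:
        buckets[name].append(line)

print(sum(len(v) for v in buckets.values()), "labelled entries collected")
```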
================================================
FILE: tasks/github/standard/claude-code/automated_changelog_generation/meta.json
================================================
{
"task_id": "automated_changelog_generation",
"task_name": "Automated Changelog Generation",
"category_id": "claude-code",
"category_name": "Claude Code",
"description": "Analyze closed issues and open PRs to generate comprehensive documentation including changelog, migration guide, and analysis reports.",
"author": "Zijian Wu",
"created_at": "2025-08-15",
"difficulty": "L3",
"tags": [
"release coordination",
"workflow automation"
],
"mcp": [
"github"
],
"meta_data": {
"stateType": "url",
"stateContent": null,
"stateUrl": "https://github.com/mcpmark-source/claude-code",
"stateOriginalUrl": "https://github.com/anthropics/claude-code"
}
}
================================================
FILE: tasks/github/standard/claude-code/automated_changelog_generation/verify.py
================================================
import sys
import os
import requests
from typing import Dict, List, Optional, Tuple
import base64
from dotenv import load_dotenv
def _get_github_api(
endpoint: str, headers: Dict[str, str], org: str, repo: str = "claude-code"
) -> Tuple[bool, Optional[Dict]]:
"""Make a GET request to GitHub API and return (success, response)."""
url = f"https://api.github.com/repos/{org}/{repo}/{endpoint}"
try:
response = requests.get(url, headers=headers)
if response.status_code == 200:
return True, response.json()
elif response.status_code == 404:
return False, None
else:
print(f"API error for {endpoint}: {response.status_code}", file=sys.stderr)
return False, None
except Exception as e:
print(f"Exception for {endpoint}: {e}", file=sys.stderr)
return False, None
def _check_branch_exists(
branch_name: str, headers: Dict[str, str], org: str, repo: str = "claude-code"
) -> bool:
"""Verify that a branch exists in the repository."""
success, _ = _get_github_api(f"branches/{branch_name}", headers, org, repo)
return success
def _get_file_content(
file_path: str,
headers: Dict[str, str],
org: str,
repo: str = "claude-code",
ref: str = "main",
) -> Optional[str]:
"""Get the content of a file from the repository."""
success, result = _get_github_api(
f"contents/{file_path}?ref={ref}", headers, org, repo
)
if not success or not result:
return None
try:
content = base64.b64decode(result.get("content", "")).decode("utf-8")
return content
except Exception as e:
print(f"Content decode error for {file_path}: {e}", file=sys.stderr)
return None
def _find_pr_by_title_keyword(
keyword: str, headers: Dict[str, str], org: str, repo: str = "claude-code"
) -> Optional[Dict]:
"""Find a PR by title keyword and return the PR data."""
for state in ["open", "closed"]:
success, prs = _get_github_api(
f"pulls?state={state}&per_page=100", headers, org, repo
)
if success and prs:
for pr in prs:
if keyword.lower() in pr.get("title", "").lower():
return pr
return None
def _get_pr_merge_commit(
pr_number: int, headers: Dict[str, str], org: str, repo: str = "claude-code"
) -> Optional[Dict]:
"""Get the merge commit for a PR to check merge method."""
success, pr = _get_github_api(f"pulls/{pr_number}", headers, org, repo)
if success and pr:
merge_commit_sha = pr.get("merge_commit_sha")
if merge_commit_sha:
success, commit = _get_github_api(
f"commits/{merge_commit_sha}", headers, org, repo
)
if success:
return commit
return None
def _check_file_sections(content: str, required_sections: List[str]) -> bool:
"""Check if file content contains required sections."""
if not content:
return False
return all(section in content for section in required_sections)
def _check_issue_references(text: str, issue_numbers: List[int]) -> int:
"""Count how many of the specified issue numbers are referenced in the text."""
if not text:
return 0
count = 0
for num in issue_numbers:
if f"#{num}" in text:
count += 1
return count
def _check_pr_references(text: str, pr_numbers: List[int]) -> int:
"""Count how many of the specified PR numbers are referenced in the text."""
if not text:
return 0
count = 0
for num in pr_numbers:
if f"#{num}" in text or f"PR #{num}" in text:
count += 1
return count
def verify() -> bool:
"""
Programmatically verify that the changelog and migration documentation workflow
meets the requirements described in description.md.
"""
# Configuration constants - these are known to us but not explicitly told to the model
DOCS_BRANCH_NAME = "docs/changelog-and-migration"
DOCS_PR_KEYWORD = "Generated changelog and migration"
# Known issue and PR numbers for verification
EXPECTED_BUG_ISSUES = [12, 13, 15, 21, 22, 23, 25, 37, 39, 48, 50]
EXPECTED_OPEN_PRS = [51, 52, 53]
# Expected file sections
CHANGELOG_SECTIONS = [
"# Changelog - Recent Fixes",
"### 🐛 Bug Fixes",
"### 📚 Documentation",
"### 🔄 Duplicates",
"### 📊 Statistics",
]
MIGRATION_GUIDE_SECTIONS = ["# Migration Guide for Pending Features"]
ISSUE_ANALYSIS_SECTIONS = [
"# Issue Analysis Report",
"## Closed Issues by Category",
"## Resolution Patterns",
"## Platform Impact Analysis",
]
PR_INTEGRATION_SECTIONS = [
"# Pull Request Integration Strategy",
"## Open PRs Overview",
"## Dependencies and Conflicts",
"## Recommended Merge Order",
"## Risk Assessment",
]
# Load environment variables from .mcp_env
load_dotenv(".mcp_env")
# Get GitHub token and org
github_token = os.environ.get("MCP_GITHUB_TOKEN")
github_org = os.environ.get("GITHUB_EVAL_ORG")
if not github_token:
print("Error: MCP_GITHUB_TOKEN environment variable not set", file=sys.stderr)
return False
if not github_org:
print("Error: GITHUB_EVAL_ORG environment variable not set", file=sys.stderr)
return False
headers = {
"Authorization": f"Bearer {github_token}",
"Accept": "application/vnd.github.v3+json",
}
# Run verification checks
print("Verifying changelog and migration documentation workflow...")
# 1. Check that documentation branch exists
print("1. Verifying documentation branch exists...")
if not _check_branch_exists(DOCS_BRANCH_NAME, headers, github_org):
print(f"Error: Branch '{DOCS_BRANCH_NAME}' not found", file=sys.stderr)
return False
print("✓ Documentation branch created")
# 2. Check changelog file
print("2. Verifying CHANGELOG-GENERATED.md...")
changelog_content = _get_file_content(
"CHANGELOG-GENERATED.md", headers, github_org, "claude-code", DOCS_BRANCH_NAME
)
if not changelog_content:
print("Error: CHANGELOG-GENERATED.md not found", file=sys.stderr)
return False
if not _check_file_sections(changelog_content, CHANGELOG_SECTIONS):
print(
"Error: CHANGELOG-GENERATED.md missing required sections", file=sys.stderr
)
return False
# Check that bug issues are referenced
bug_refs = _check_issue_references(changelog_content, EXPECTED_BUG_ISSUES)
if bug_refs < 8: # At least 8 of the bug issues
print(
f"Error: CHANGELOG-GENERATED.md only references {bug_refs} bug issues, expected at least 8",
file=sys.stderr,
)
return False
# Check for platform and area statistics
if (
"platform:" not in changelog_content.lower()
or "area:" not in changelog_content.lower()
):
print(
"Error: CHANGELOG-GENERATED.md missing platform or area distribution",
file=sys.stderr,
)
return False
print("✓ Changelog created with proper content")
# 3. Check migration guide
print("3. Verifying MIGRATION_GUIDE.md...")
migration_content = _get_file_content(
"docs/MIGRATION_GUIDE.md", headers, github_org, "claude-code", DOCS_BRANCH_NAME
)
if not migration_content:
print("Error: docs/MIGRATION_GUIDE.md not found", file=sys.stderr)
return False
if not _check_file_sections(migration_content, MIGRATION_GUIDE_SECTIONS):
print("Error: MIGRATION_GUIDE.md missing required sections", file=sys.stderr)
return False
# Check that all expected open PRs are mentioned
pr_refs = _check_pr_references(migration_content, EXPECTED_OPEN_PRS)
if pr_refs < 3:
print(
f"Error: MIGRATION_GUIDE.md only references {pr_refs}/3 open PRs",
file=sys.stderr,
)
return False
print("✓ Migration guide created with proper content")
# 4. Check issue analysis report
print("4. Verifying ISSUE_ANALYSIS.md...")
issue_analysis_content = _get_file_content(
"reports/ISSUE_ANALYSIS.md",
headers,
github_org,
"claude-code",
DOCS_BRANCH_NAME,
)
if not issue_analysis_content:
print("Error: reports/ISSUE_ANALYSIS.md not found", file=sys.stderr)
return False
if not _check_file_sections(issue_analysis_content, ISSUE_ANALYSIS_SECTIONS):
print("Error: ISSUE_ANALYSIS.md missing required sections", file=sys.stderr)
return False
# Check for cross-project and memory issue mentions
if "#50" not in issue_analysis_content and "#48" not in issue_analysis_content:
print(
"Warning: ISSUE_ANALYSIS.md may be missing cross-project issue references",
file=sys.stderr,
)
print("✓ Issue analysis report created")
# 5. Check PR integration plan
print("5. Verifying PR_INTEGRATION_PLAN.md...")
pr_plan_content = _get_file_content(
"reports/PR_INTEGRATION_PLAN.md",
headers,
github_org,
"claude-code",
DOCS_BRANCH_NAME,
)
if not pr_plan_content:
print("Error: reports/PR_INTEGRATION_PLAN.md not found", file=sys.stderr)
return False
if not _check_file_sections(pr_plan_content, PR_INTEGRATION_SECTIONS):
print(
"Error: PR_INTEGRATION_PLAN.md missing required sections", file=sys.stderr
)
return False
# Check that all open PRs are analyzed
pr_refs_in_plan = _check_pr_references(pr_plan_content, EXPECTED_OPEN_PRS)
if pr_refs_in_plan < 3:
print(
f"Error: PR_INTEGRATION_PLAN.md only references {pr_refs_in_plan}/3 open PRs",
file=sys.stderr,
)
return False
print("✓ PR integration plan created")
# 6. Find and verify the documentation PR
print("6. Verifying documentation pull request...")
docs_pr = _find_pr_by_title_keyword(DOCS_PR_KEYWORD, headers, github_org)
if not docs_pr:
# Try alternative keyword
docs_pr = _find_pr_by_title_keyword(
"changelog and migration", headers, github_org
)
if not docs_pr:
print("Error: Documentation PR not found", file=sys.stderr)
return False
pr_body = docs_pr.get("body", "")
pr_number = docs_pr.get("number")
# Check PR body sections
required_sections = [
"## Summary",
"## Files Created",
"## Issues Processed",
"## PRs Analyzed",
]
missing_sections = []
for section in required_sections:
if section not in pr_body:
missing_sections.append(section)
if len(missing_sections) > 1: # Allow 1 missing section for flexibility
print(
f"Error: Documentation PR missing sections: {missing_sections}",
file=sys.stderr,
)
return False
print("✓ Documentation PR created")
# 7. Check that the documentation PR has been merged with squash method
print("7. Verifying documentation PR merge with squash method...")
if docs_pr.get("state") != "closed" or not docs_pr.get("merged_at"):
print("Error: Documentation PR has not been merged", file=sys.stderr)
return False
# Check merge method was squash by examining the merge commit
merge_commit = _get_pr_merge_commit(pr_number, headers, github_org)
if merge_commit:
# Squash merges typically have only one parent (the base branch)
parents = merge_commit.get("parents", [])
if len(parents) != 1:
print(
f"Warning: Merge commit has {len(parents)} parents, may not be squash merge",
file=sys.stderr,
)
# Check commit message pattern typical of squash merges
commit_message = merge_commit.get("commit", {}).get("message", "")
if f"#{pr_number}" not in commit_message:
print(
"Warning: Merge commit message may not follow squash merge pattern",
file=sys.stderr,
)
else:
print("Warning: Could not retrieve merge commit details", file=sys.stderr)
merged_at = docs_pr.get("merged_at")
if not merged_at:
print("Error: Documentation PR merge timestamp not found", file=sys.stderr)
return False
print("✓ Documentation PR merged successfully")
print("\n✅ All verification checks passed!")
print("Changelog and migration documentation completed successfully:")
print(f" - Documentation PR #{pr_number} (merged)")
print(f" - Branch: {DOCS_BRANCH_NAME}")
print(" - Files created: 4 documentation files")
print(f" - Bug issues referenced: {bug_refs}/{len(EXPECTED_BUG_ISSUES)}")
print(f" - Open PRs analyzed: {pr_refs}/{len(EXPECTED_OPEN_PRS)}")
return True
if __name__ == "__main__":
success = verify()
sys.exit(0 if success else 1)
================================================
FILE: tasks/github/standard/claude-code/claude_collaboration_analysis/description.md
================================================
I need you to analyze the collaboration patterns between human developers and Claude (the AI assistant) in the repository by examining all available commit history, then create a comprehensive analysis report and submit it as a new file to the repository.
**Step 1: Commit History Analysis**
Analyze ALL commits in the repository to identify:
1. **Claude Co-Authored Commits**: Find all commits that were co-authored by Claude (look for "Co-Authored-By: Claude <noreply@anthropic.com>" in commit messages)
2. **Top Claude Collaborators**: Identify the top 3 human developers who most frequently collaborated with Claude
**Step 2: Create Collaboration Analysis Report**
Create a file called `CLAUDE_COLLABORATION_ANALYSIS.md` in the repository root with:
- A "# Claude AI Collaboration Analysis" title
- A "## Summary Statistics" section with these exact format requirements:
- "Total commits analyzed: [NUMBER]"
- "Number of Claude co-authored commits found: [NUMBER]"
- "Percentage of commits with Claude collaboration: [NUMBER]%"
- "Number of unique human collaborators who worked with Claude: [NUMBER]"
- A "## Top Claude Collaborators" section with this exact table format:
```markdown
| Developer | GitHub Username | Claude Collaborations |
|-----------|----------------|----------------------|
```
Include the top 3 developers by number of Claude collaborations.
**Step 3: Commit Analysis to Repository**
Commit the `CLAUDE_COLLABORATION_ANALYSIS.md` file to the main branch with:
- Commit message: "Add Claude AI collaboration analysis report"
- Ensure all statistics are accurate based on actual commit data
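A minimal sketch of the tallying step, paginating the commits API and counting the co-author trailer per human author (the login fallback for commits without a linked GitHub account is an assumption):
```python
import os
from collections import Counter

import requests

ORG = os.environ["GITHUB_EVAL_ORG"]
TOKEN = os.environ["MCP_GITHUB_TOKEN"]
HEADERS = {"Authorization": f"Bearer {TOKEN}", "Accept": "application/vnd.github+json"}
API = f"https://api.github.com/repos/{ORG}/claude-code"

totals, claude_commits, by_author = 0, 0, Counter()
page = 1
while True:
    batch = requests.get(f"{API}/commits", headers=HEADERS, timeout=30,
                         params={"per_page": 100, "page": page}).json()
    if not batch:
        break
    for c in batch:
        totals += 1
        if "Co-Authored-By: Claude" in c["commit"]["message"]:
            claude_commits += 1
            login = (c.get("author") or {}).get("login") or c["commit"]["author"]["name"]
            by_author[login] += 1
    page += 1

print(f"{claude_commits}/{totals} commits co-authored with Claude")
print(by_author.most_common(3))
```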
================================================
FILE: tasks/github/standard/claude-code/claude_collaboration_analysis/meta.json
================================================
{
"task_id": "claude_collaboration_analysis",
"task_name": "Claude Collaboration Analysis",
"category_id": "claude-code",
"category_name": "Claude Code",
"description": "Analyze Claude AI collaboration patterns in commit history and create a comprehensive report of co-authored commits and top collaborators.",
"author": "Zijian Wu",
"created_at": "2025-08-15",
"difficulty": "L3",
"tags": [
"repository analysis"
],
"mcp": [
"github"
],
"meta_data": {
"stateType": "url",
"stateContent": null,
"stateUrl": "https://github.com/mcpmark-source/claude-code",
"stateOriginalUrl": "https://github.com/anthropics/claude-code"
}
}
================================================
FILE: tasks/github/standard/claude-code/claude_collaboration_analysis/verify.py
================================================
import sys
import os
import requests
from typing import Dict, List, Optional, Tuple
import base64
import re
from dotenv import load_dotenv
def _get_github_api(
endpoint: str, headers: Dict[str, str], org: str, repo: str = "claude-code"
) -> Tuple[bool, Optional[Dict]]:
"""Make a GET request to GitHub API and return (success, response)."""
url = f"https://api.github.com/repos/{org}/{repo}/{endpoint}"
try:
response = requests.get(url, headers=headers)
if response.status_code == 200:
return True, response.json()
elif response.status_code == 404:
return False, None
else:
print(f"API error for {endpoint}: {response.status_code}", file=sys.stderr)
return False, None
except Exception as e:
print(f"Exception for {endpoint}: {e}", file=sys.stderr)
return False, None
def _get_file_content(
file_path: str,
headers: Dict[str, str],
org: str,
repo: str = "claude-code",
ref: str = "main",
) -> Optional[str]:
"""Get the content of a file from the repository."""
success, result = _get_github_api(
f"contents/{file_path}?ref={ref}", headers, org, repo
)
if not success or not result:
return None
try:
content = base64.b64decode(result.get("content", "")).decode("utf-8")
return content
except Exception as e:
print(f"Content decode error for {file_path}: {e}", file=sys.stderr)
return None
def _parse_summary_statistics(content: str) -> Dict:
"""Parse the summary statistics section from the report."""
stats = {}
lines = content.split("\n")
in_summary = False
for line in lines:
if "## Summary Statistics" in line:
in_summary = True
continue
if in_summary:
if "##" in line and "Summary Statistics" not in line:
break
# Parse statistics lines
if "Total commits analyzed" in line:
match = re.search(r"(\d+)", line)
if match:
stats["total_analyzed"] = int(match.group(1))
elif "Number of Claude co-authored commits" in line:
match = re.search(r"(\d+)", line)
if match:
stats["claude_commits"] = int(match.group(1))
elif "Percentage of commits with Claude collaboration" in line:
match = re.search(r"([\d.]+)%", line)
if match:
stats["percentage"] = float(match.group(1))
elif "Number of unique human collaborators" in line:
match = re.search(r"(\d+)", line)
if match:
stats["unique_collaborators"] = int(match.group(1))
return stats
def _parse_collaborators_table(content: str) -> List[Dict]:
"""Parse the top collaborators table from the report."""
collaborators = []
lines = content.split("\n")
in_table = False
for line in lines:
if "| Developer | GitHub Username | Claude Collaborations |" in line:
in_table = True
continue
if in_table and line.startswith("|---"):
continue
if in_table and line.startswith("|"):
parts = [p.strip() for p in line.split("|")]
if len(parts) >= 4: # Should have 3 columns plus empty parts
developer = parts[1].strip()
username = parts[2].strip()
collaborations = parts[3].strip()
if developer and username and collaborations:
try:
collaborators.append(
{
"developer": developer,
"username": username,
"collaborations": int(collaborations),
}
)
except ValueError:
pass
if in_table and line and not line.startswith("|") and "##" in line:
break
return collaborators
def verify_task() -> bool:
"""Verify the Claude collaboration analysis task."""
# Load environment variables from .mcp_env
load_dotenv(".mcp_env")
# Get GitHub token and org
github_token = os.environ.get("MCP_GITHUB_TOKEN")
github_org = os.environ.get("GITHUB_EVAL_ORG")
if not github_token:
print("Error: MCP_GITHUB_TOKEN environment variable not set", file=sys.stderr)
return False
if not github_org:
print("Error: GITHUB_EVAL_ORG environment variable not set", file=sys.stderr)
return False
headers = {
"Authorization": f"Bearer {github_token}",
"Accept": "application/vnd.github.v3+json",
}
# Pre-computed expected values based on repository analysis
# These are the correct answers the agent should find
EXPECTED_TOP_COLLABORATORS = [
{
"username": "bcherny",
"min_collaborations": 14,
}, # Boris Cherny has many Claude collaborations
{"username": "ashwin-ant", "min_collaborations": 5}, # Ashwin Bhat has some
{"username": "ant-kurt", "min_collaborations": 3}, # Kurt Carpenter has several
]
# Expected exact values for summary statistics
EXPECTED_STATS = {
"total_analyzed": 158,
"claude_commits": 25,
"percentage": 15.82,
"unique_collaborators": 6,
}
print("Verifying Claude collaboration analysis task...")
# 1. Check if CLAUDE_COLLABORATION_ANALYSIS.md exists in main branch
print("1. Checking if CLAUDE_COLLABORATION_ANALYSIS.md exists...")
content = _get_file_content("CLAUDE_COLLABORATION_ANALYSIS.md", headers, github_org)
if not content:
print(
"Error: CLAUDE_COLLABORATION_ANALYSIS.md not found in main branch",
file=sys.stderr,
)
return False
print("✓ CLAUDE_COLLABORATION_ANALYSIS.md found")
# 2. Check required sections exist
print("2. Checking required sections...")
required_sections = [
"# Claude AI Collaboration Analysis",
"## Summary Statistics",
"## Top Claude Collaborators",
]
for section in required_sections:
if section not in content:
print(f"Error: Missing required section '{section}'", file=sys.stderr)
return False
print("✓ All required sections present")
# 3. Parse and validate summary statistics
print("3. Validating summary statistics...")
stats = _parse_summary_statistics(content)
if "total_analyzed" not in stats:
print("Error: Total commits analyzed not found", file=sys.stderr)
return False
# Check exact values against expected statistics
if stats.get("total_analyzed") != EXPECTED_STATS["total_analyzed"]:
print(
f"Error: Total analyzed should be {EXPECTED_STATS['total_analyzed']}, found {stats.get('total_analyzed')}",
file=sys.stderr,
)
return False
if stats.get("claude_commits") != EXPECTED_STATS["claude_commits"]:
print(
f"Error: Claude commits should be {EXPECTED_STATS['claude_commits']}, found {stats.get('claude_commits')}",
file=sys.stderr,
)
return False
# Allow 0.1% tolerance for percentage
expected_percentage = EXPECTED_STATS["percentage"]
actual_percentage = stats.get("percentage", 0)
if abs(actual_percentage - expected_percentage) > 0.1:
print(
f"Error: Percentage should be around {expected_percentage}% (±0.1%), found {actual_percentage}%",
file=sys.stderr,
)
return False
if stats.get("unique_collaborators") != EXPECTED_STATS["unique_collaborators"]:
print(
f"Error: Unique collaborators should be {EXPECTED_STATS['unique_collaborators']}, found {stats.get('unique_collaborators')}",
file=sys.stderr,
)
return False
print("✓ Summary statistics validated")
# 4. Validate top collaborators table
print("4. Validating top collaborators...")
collaborators = _parse_collaborators_table(content)
if len(collaborators) < 3:
print(
f"Error: Expected 3 top collaborators, found {len(collaborators)}",
file=sys.stderr,
)
return False
# Check that expected top collaborators are present
found_usernames = [c["username"] for c in collaborators]
# The top 3 should include at least 2 of our expected collaborators
expected_found = 0
for expected in EXPECTED_TOP_COLLABORATORS:
if expected["username"] in found_usernames[:3]:
expected_found += 1
# Also check they have reasonable collaboration counts
for collab in collaborators:
if collab["username"] == expected["username"]:
if collab["collaborations"] < expected["min_collaborations"]:
print(
f"Error: {expected['username']} should have at least {expected['min_collaborations']} collaborations, found {collab['collaborations']}",
file=sys.stderr,
)
return False
if expected_found < 2:
print(
f"Error: Expected to find at least 2 of the known top collaborators in top 3, found {expected_found}",
file=sys.stderr,
)
print(
f"Expected to see at least 2 of: {[e['username'] for e in EXPECTED_TOP_COLLABORATORS]}",
file=sys.stderr,
)
print(f"Found: {found_usernames[:3]}", file=sys.stderr)
return False
print("✓ Top collaborators validated")
# 5. Check commit message verification
print("5. Verifying commit message...")
success, latest_commits = _get_github_api(
"commits?per_page=10", headers, github_org
)
if not success:
print("Error: Failed to fetch recent commits", file=sys.stderr)
return False
# Look for commit with expected message
expected_commit_message = "Add Claude AI collaboration analysis report"
commit_found = False
for commit in latest_commits:
if commit["commit"]["message"].startswith(expected_commit_message):
commit_found = True
break
if not commit_found:
print(
f"Error: Expected commit message '{expected_commit_message}' not found in recent commits",
file=sys.stderr,
)
return False
print("✓ Commit message verified")
# 6. Additional validation: Check unique collaborators count
print("6. Final validation complete...")
print("✓ All statistics match expected values")
print("\n✅ All verification checks passed!")
print("Claude collaboration analysis completed successfully:")
print(" - File: CLAUDE_COLLABORATION_ANALYSIS.md created in main branch")
print(f" - Commits analyzed: {stats.get('total_analyzed', 'N/A')}")
print(f" - Claude collaborations found: {stats.get('claude_commits', 'N/A')}")
print(f" - Top collaborators identified: {len(collaborators)}")
print(" - All statistics verified")
print(" - Commit message verified")
return True
if __name__ == "__main__":
success = verify_task()
sys.exit(0 if success else 1)
================================================
FILE: tasks/github/standard/claude-code/critical_issue_hotfix_workflow/description.md
================================================
I need you to implement a comprehensive critical issue hotfix workflow for the repository that demonstrates advanced PR management, selective merging, and issue resolution tracking.
**Step 1: Create Critical Bug Tracking Issue**
Create a new issue with:
- Title: "CRITICAL: Memory and Context Management Issues - Hotfix Tracking"
- Body must include:
- A "## Critical Issues" heading listing issues #49 and #46
- A "## Impact Assessment" heading describing user impact
- A "## Resolution Strategy" heading with planned approach
- References to existing issues #49, #46, and #47 using "#" notation
- Keywords: "memory exhaustion", "context auto-compact", "JavaScript heap", "hotfix priority"
**Step 2: Create Memory Optimization Hotfix Branch**
Create a new branch called 'hotfix/memory-optimization-v1.0.72' from the main branch.
**Step 3: Implement Memory Management Documentation**
On the hotfix branch, create the file `docs/MEMORY_OPTIMIZATION.md` with this exact content:
````markdown
# Memory Optimization Guide for Claude Code v1.0.72
## Overview
This document addresses critical memory issues identified in issues #49 and #46.
## Memory Management Issues
### Context Auto-Compact Problem (Issue #49)
- **Root Cause**: Context management stuck at 0% completion
- **Impact**: Tool becomes unusable on macOS platforms
- **Solution**: Implement progressive context cleanup with configurable thresholds
### JavaScript Heap Exhaustion (Issue #46)
- **Root Cause**: Memory allocation failure during large MCP operations
- **Impact**: Complete Claude Code crash requiring restart
- **Solution**: Add streaming data processing and garbage collection optimization
## Optimization Strategies
### Immediate Fixes
1. **Context Buffer Management**
- Implement 10MB default context buffer limit
- Add automatic context pruning at 80% threshold
- Enable manual context reset via `/memory-reset` command
2. **MCP Operation Streaming**
- Process large datasets in 1MB chunks
- Implement backpressure for MongoDB operations
- Add memory usage monitoring and alerts
### Configuration Options
```json
{
"memory": {
"contextBufferLimit": "10MB",
"autoCompactThreshold": 0.8,
"streamingChunkSize": "1MB",
"gcOptimization": true
}
}
```
## Related Issues
- Fixes issue #49: Context auto-compact functionality
- Addresses issue #46: JavaScript heap out of memory crashes
- Related to issue #47: Cross-project hook execution problems
````
**Step 4: Create Pull Request with Issue Cross-References**
Create a pull request from 'hotfix/memory-optimization-v1.0.72' to 'main' with:
- Title: "HOTFIX: Critical memory optimization for issues #49 and #46"
- Body must include:
- A "## Summary" heading describing the memory fixes
- A "## Critical Issues Addressed" heading listing specific problems
- A "## Documentation Changes" heading describing the new guide
- "Addresses #49" and "Addresses #46" pattern linking to existing issues
- Reference to your tracking issue using "Tracked in #[ISSUE_NUMBER]"
- Keywords: "memory optimization", "context management", "heap exhaustion", "v1.0.72 hotfix"
**Step 5: Update and Merge PR #51 (Statsig Logging)**
For the existing PR #51:
- Update the PR description to include technical implementation details
- Add a "## Technical Implementation" section mentioning "event logging integration"
- Add keywords: "workflow enhancement", "issue management automation", "logging consistency"
- Merge the PR using the squash merge method
**Step 6: Add Implementation Comment to Tracking Issue**
Add a comment to your original tracking issue with:
- Reference to your hotfix PR using "PR #[NUMBER]" pattern
- Reference to actions taken on PR #51
- Technical details about the memory optimization approach
- Keywords: "context buffer management", "streaming optimization", "progressive cleanup"
- Mention of configuration options and thresholds
**Step 7: Close Tracking Issue with Resolution Summary**
Close your tracking issue by updating its state to 'closed' with:
- A final comment summarizing completed actions
- Reference to merged PR #51 and pending hotfix PR
- Keywords: "hotfix deployment", "memory issues resolved", "documentation updated"
================================================
FILE: tasks/github/standard/claude-code/critical_issue_hotfix_workflow/meta.json
================================================
{
"task_id": "critical_issue_hotfix_workflow",
"task_name": "Critical Issue Hotfix Workflow",
"category_id": "claude-code",
"category_name": "Claude Code",
"description": "Implement a critical issue hotfix workflow for memory and context management issues with proper PR management and issue tracking.",
"author": "Zijian Wu",
"created_at": "2025-08-15",
"difficulty": "L3",
"tags": [
"issue management",
"pr workflows"
],
"mcp": [
"github"
],
"meta_data": {
"stateType": "url",
"stateContent": null,
"stateUrl": "https://github.com/mcpmark-source/claude-code",
"stateOriginalUrl": "https://github.com/anthropics/claude-code"
}
}
================================================
FILE: tasks/github/standard/claude-code/critical_issue_hotfix_workflow/verify.py
================================================
import sys
import os
import requests
from typing import Dict, List, Optional, Tuple
import base64
from dotenv import load_dotenv
def _get_github_api(
endpoint: str, headers: Dict[str, str], org: str, repo: str = "claude-code"
) -> Tuple[bool, Optional[Dict]]:
"""Make a GET request to GitHub API and return (success, response)."""
url = f"https://api.github.com/repos/{org}/{repo}/{endpoint}"
try:
response = requests.get(url, headers=headers)
if response.status_code == 200:
return True, response.json()
elif response.status_code == 404:
return False, None
else:
print(f"API error for {endpoint}: {response.status_code}", file=sys.stderr)
return False, None
except Exception as e:
print(f"Exception for {endpoint}: {e}", file=sys.stderr)
return False, None
def _check_branch_exists(
branch_name: str, headers: Dict[str, str], org: str, repo: str = "claude-code"
) -> bool:
"""Verify that a branch exists in the repository."""
success, _ = _get_github_api(f"branches/{branch_name}", headers, org, repo)
return success
def _get_file_content(
file_path: str,
headers: Dict[str, str],
org: str,
repo: str = "claude-code",
ref: str = "main",
) -> Optional[str]:
"""Get the content of a file from the repository."""
success, result = _get_github_api(
f"contents/{file_path}?ref={ref}", headers, org, repo
)
if not success or not result:
return None
try:
content = base64.b64decode(result.get("content", "")).decode("utf-8")
return content
except Exception as e:
print(f"Content decode error for {file_path}: {e}", file=sys.stderr)
return None
def _find_issue_by_title_keyword(
keyword: str, headers: Dict[str, str], org: str, repo: str = "claude-code"
) -> Optional[Dict]:
"""Find an issue by title keyword and return the issue data."""
# Check both open and closed issues
for state in ["open", "closed"]:
success, issues = _get_github_api(
f"issues?state={state}&per_page=100", headers, org, repo
)
if success and issues:
for issue in issues:
if keyword.lower() in issue.get("title", "").lower():
return issue
return None
def _find_pr_by_title_keyword(
keyword: str, headers: Dict[str, str], org: str, repo: str = "claude-code"
) -> Optional[Dict]:
"""Find a PR by title keyword and return the PR data."""
# Check both open and closed PRs
for state in ["open", "closed"]:
success, prs = _get_github_api(
f"pulls?state={state}&per_page=100", headers, org, repo
)
if success and prs:
for pr in prs:
if keyword.lower() in pr.get("title", "").lower():
return pr
return None
def _get_pr_by_number(
pr_number: int, headers: Dict[str, str], org: str, repo: str = "claude-code"
) -> Optional[Dict]:
"""Get a specific PR by number."""
success, pr = _get_github_api(f"pulls/{pr_number}", headers, org, repo)
if success:
return pr
return None
def _check_issue_references(text: str, reference_numbers: List[str]) -> bool:
"""Check if text contains references to specified issue numbers."""
if not text:
return False
return all(f"#{ref}" in text for ref in reference_numbers)
def _check_addresses_pattern(pr_body: str, issue_numbers: List[str]) -> bool:
"""Check if PR body contains 'Addresses #X' pattern for specified issues."""
if not pr_body:
return False
return all(
f"Addresses #{num}" in pr_body or f"addresses #{num}" in pr_body
for num in issue_numbers
)
def _get_issue_comments(
issue_number: int, headers: Dict[str, str], org: str, repo: str = "claude-code"
) -> List[Dict]:
"""Get all comments for an issue."""
success, comments = _get_github_api(
f"issues/{issue_number}/comments", headers, org, repo
)
if success and comments:
return comments
return []
def _get_pr_reviews(
pr_number: int, headers: Dict[str, str], org: str, repo: str = "claude-code"
) -> List[Dict]:
"""Get all reviews for a PR."""
success, reviews = _get_github_api(f"pulls/{pr_number}/reviews", headers, org, repo)
if success and reviews:
return reviews
return []
def _check_title_keywords(title: str, required_keywords: List[str]) -> bool:
"""Check if title contains all required keywords."""
return all(keyword.lower() in title.lower() for keyword in required_keywords)
def _check_headings_and_keywords(
body: str, headings: List[str], keywords: List[str]
) -> bool:
"""Check if body contains required headings and keywords."""
has_headings = all(heading in body for heading in headings)
has_keywords = all(keyword.lower() in body.lower() for keyword in keywords)
return has_headings and has_keywords
def _check_exact_file_content(content: str, expected_sections: List[str]) -> bool:
"""Check if file content contains expected sections."""
return all(section in content for section in expected_sections)
def verify() -> bool:
"""
Programmatically verify that the critical issue hotfix workflow meets the
requirements described in description.md.
"""
# Configuration constants
HOTFIX_BRANCH_NAME = "hotfix/memory-optimization-v1.0.72"
TRACKING_ISSUE_KEYWORD = "Memory and Context Management Issues"
HOTFIX_PR_KEYWORD = "HOTFIX: Critical memory optimization"
# Expected file content sections
MEMORY_DOC_SECTIONS = [
"# Memory Optimization Guide for Claude Code v1.0.72",
"## Overview",
"### Context Auto-Compact Problem (Issue #49)",
"### JavaScript Heap Exhaustion (Issue #46)",
"## Optimization Strategies",
"### Immediate Fixes",
"### Configuration Options",
"## Related Issues",
]
# Issue content requirements
TRACKING_ISSUE_TITLE_KEYWORDS = [
"CRITICAL",
"Memory",
"Context Management",
"Hotfix Tracking",
]
TRACKING_ISSUE_REFERENCE_NUMBERS = ["49", "46", "47"]
TRACKING_ISSUE_HEADINGS = [
"## Critical Issues",
"## Impact Assessment",
"## Resolution Strategy",
]
TRACKING_ISSUE_KEYWORDS = [
"memory exhaustion",
"context auto-compact",
"JavaScript heap",
"hotfix priority",
]
# PR content requirements
HOTFIX_PR_TITLE_KEYWORDS = [
"HOTFIX",
"Critical memory optimization",
"issues #49",
"#46",
]
HOTFIX_PR_ADDRESSES_NUMBERS = ["49", "46"]
HOTFIX_PR_HEADINGS = [
"## Summary",
"## Critical Issues Addressed",
"## Documentation Changes",
]
HOTFIX_PR_KEYWORDS = [
"memory optimization",
"context management",
"heap exhaustion",
"v1.0.72 hotfix",
]
# PR #51 update requirements
PR51_UPDATE_KEYWORDS = [
"Technical Implementation",
"event logging integration",
"workflow enhancement",
]
# Issue comment requirements
ISSUE_COMMENT_KEYWORDS = [
"context buffer management",
"streaming optimization",
"progressive cleanup",
]
# Load environment variables from .mcp_env
load_dotenv(".mcp_env")
# Get GitHub token and org
github_token = os.environ.get("MCP_GITHUB_TOKEN")
github_org = os.environ.get("GITHUB_EVAL_ORG")
if not github_token:
print("Error: MCP_GITHUB_TOKEN environment variable not set", file=sys.stderr)
return False
if not github_org:
print("Error: GITHUB_EVAL_ORG environment variable not set", file=sys.stderr)
return False
headers = {
"Authorization": f"Bearer {github_token}",
"Accept": "application/vnd.github.v3+json",
}
# Run verification checks
print("Verifying critical issue hotfix workflow completion...")
# 1. Check that hotfix branch exists
print("1. Verifying hotfix branch exists...")
if not _check_branch_exists(HOTFIX_BRANCH_NAME, headers, github_org):
print(f"Error: Branch '{HOTFIX_BRANCH_NAME}' not found", file=sys.stderr)
return False
print("✓ Hotfix branch created")
# 2. Check that the memory optimization documentation exists with exact content
print("2. Verifying MEMORY_OPTIMIZATION.md documentation...")
memory_doc_content = _get_file_content(
"docs/MEMORY_OPTIMIZATION.md",
headers,
github_org,
"claude-code",
HOTFIX_BRANCH_NAME,
)
if not memory_doc_content:
print(
"Error: docs/MEMORY_OPTIMIZATION.md not found in hotfix branch",
file=sys.stderr,
)
return False
if not _check_exact_file_content(memory_doc_content, MEMORY_DOC_SECTIONS):
print(
"Error: MEMORY_OPTIMIZATION.md missing required sections or content",
file=sys.stderr,
)
return False
print("✓ Memory optimization documentation created with correct content")
# 3. Find and verify the tracking issue
print("3. Verifying tracking issue creation and content...")
tracking_issue = _find_issue_by_title_keyword(
TRACKING_ISSUE_KEYWORD, headers, github_org
)
if not tracking_issue:
print(
f"Error: Tracking issue with keyword '{TRACKING_ISSUE_KEYWORD}' not found",
file=sys.stderr,
)
return False
tracking_issue_number = tracking_issue.get("number")
tracking_issue_title = tracking_issue.get("title", "")
tracking_issue_body = tracking_issue.get("body", "")
# Check tracking issue title keywords
if not _check_title_keywords(tracking_issue_title, TRACKING_ISSUE_TITLE_KEYWORDS):
print("Error: Tracking issue title missing required keywords", file=sys.stderr)
return False
# Check tracking issue headings, content and references
if not _check_headings_and_keywords(
tracking_issue_body, TRACKING_ISSUE_HEADINGS, TRACKING_ISSUE_KEYWORDS
):
print(
"Error: Tracking issue missing required headings or keywords",
file=sys.stderr,
)
return False
if not _check_issue_references(
tracking_issue_body, TRACKING_ISSUE_REFERENCE_NUMBERS
):
print(
"Error: Tracking issue does not reference required issues #49, #46, #47",
file=sys.stderr,
)
return False
print("✓ Tracking issue created with correct content and references")
# 4. Find and verify the hotfix PR
print("4. Verifying hotfix pull request creation and content...")
hotfix_pr = _find_pr_by_title_keyword(HOTFIX_PR_KEYWORD, headers, github_org)
if not hotfix_pr:
print(
f"Error: Hotfix PR with keyword '{HOTFIX_PR_KEYWORD}' not found",
file=sys.stderr,
)
return False
hotfix_pr_number = hotfix_pr.get("number")
hotfix_pr_title = hotfix_pr.get("title", "")
hotfix_pr_body = hotfix_pr.get("body", "")
# Check hotfix PR title keywords
if not _check_title_keywords(hotfix_pr_title, HOTFIX_PR_TITLE_KEYWORDS):
print("Error: Hotfix PR title missing required keywords", file=sys.stderr)
return False
# Check hotfix PR headings and content
if not _check_headings_and_keywords(
hotfix_pr_body, HOTFIX_PR_HEADINGS, HOTFIX_PR_KEYWORDS
):
print("Error: Hotfix PR missing required headings or keywords", file=sys.stderr)
return False
# Check hotfix PR addresses pattern
if not _check_addresses_pattern(hotfix_pr_body, HOTFIX_PR_ADDRESSES_NUMBERS):
print(
"Error: Hotfix PR does not properly address issues #49 and #46",
file=sys.stderr,
)
return False
# Check reference to tracking issue
if f"#{tracking_issue_number}" not in hotfix_pr_body:
print(
f"Error: Hotfix PR does not reference tracking issue #{tracking_issue_number}",
file=sys.stderr,
)
return False
print("✓ Hotfix PR created with correct content and references")
# 5. Check PR #51 has been updated and merged
print("5. Verifying PR #51 update and merge...")
pr51 = _get_pr_by_number(51, headers, github_org)
if not pr51:
print("Error: PR #51 not found", file=sys.stderr)
return False
pr51_body = pr51.get("body", "")
pr51_state = pr51.get("state", "")
# Check PR #51 has been updated with required content
if not _check_headings_and_keywords(
pr51_body, ["## Technical Implementation"], PR51_UPDATE_KEYWORDS
):
print(
"Error: PR #51 missing updated technical implementation section",
file=sys.stderr,
)
return False
# Check PR #51 has been merged
if pr51_state != "closed" or not pr51.get("merged_at"):
print("Error: PR #51 has not been merged", file=sys.stderr)
return False
print("✓ PR #51 updated and merged successfully")
# 6. Check tracking issue has implementation comment
print("6. Verifying tracking issue implementation comment...")
tracking_issue_comments = _get_issue_comments(
tracking_issue_number, headers, github_org
)
has_implementation_comment = False
for comment in tracking_issue_comments:
body = comment.get("body", "")
has_pr_ref = f"PR #{hotfix_pr_number}" in body
has_pr51_ref = "PR #51" in body
has_keywords = all(
keyword.lower() in body.lower() for keyword in ISSUE_COMMENT_KEYWORDS
)
if has_pr_ref and has_pr51_ref and has_keywords:
has_implementation_comment = True
break
if not has_implementation_comment:
print(
f"Error: Tracking issue #{tracking_issue_number} missing implementation comment with required references and keywords",
file=sys.stderr,
)
return False
print("✓ Tracking issue has implementation comment with PR references")
# 7. Check tracking issue is closed
print("7. Verifying tracking issue closure...")
if tracking_issue.get("state") != "closed":
print(
f"Error: Tracking issue #{tracking_issue_number} is not closed",
file=sys.stderr,
)
return False
print("✓ Tracking issue closed successfully")
print("\n✅ All verification checks passed!")
print("Critical issue hotfix workflow completed successfully:")
print(f" - Tracking Issue #{tracking_issue_number}: {tracking_issue.get('title')}")
print(f" - Hotfix PR #{hotfix_pr_number}: {hotfix_pr.get('title')}")
print(f" - Branch: {HOTFIX_BRANCH_NAME}")
print(" - PR #51 merged: ✓")
print(" - Memory optimization documentation: ✓")
return True
if __name__ == "__main__":
success = verify()
sys.exit(0 if success else 1)
================================================
FILE: tasks/github/standard/claude-code/feature_commit_tracking/description.md
================================================
I need you to research the development history of the repository across multiple branches and commits, then create a comprehensive feature tracking document and submit it as a new file to the repository.
**Step 1: Multi-Branch Feature Investigation**
Research and identify the exact commit SHAs where these specific features were introduced by analyzing commits across different branches:
1. **Shell Completion Scripts**: Find when shell completion functionality was first added to the repository
2. **CHANGELOG Version 1.0.65**: Find when the changelog was updated to include version 1.0.65
3. **Rust Extraction Improvements**: Find when workflow improvements for Rust code extraction were implemented
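As an illustrative starting point only, the sketch below (assuming a placeholder `your-org/claude-code` repository and a token in `MCP_GITHUB_TOKEN`) enumerates the branches and pages through each branch's commits so candidate SHAs can be inspected by message and date:
```python
# Illustrative sketch: enumerate branches, then list commits on each branch so
# the introducing commit for a feature can be located by message and date.
import os
import requests

ORG, REPO = "your-org", "claude-code"  # placeholder values
HEADERS = {
    "Authorization": f"Bearer {os.environ['MCP_GITHUB_TOKEN']}",
    "Accept": "application/vnd.github.v3+json",
}
BASE = f"https://api.github.com/repos/{ORG}/{REPO}"
branches = requests.get(f"{BASE}/branches", headers=HEADERS, params={"per_page": 100})
branches.raise_for_status()
for branch in branches.json():
    commits = requests.get(
        f"{BASE}/commits",
        headers=HEADERS,
        params={"sha": branch["name"], "per_page": 100},
    )
    commits.raise_for_status()
    for c in commits.json():
        # The message and date live under the nested "commit" object.
        print(branch["name"], c["sha"], c["commit"]["message"].split("\n")[0])
```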
**Step 2: Create Feature Tracking Documentation**
Create a file called `FEATURE_COMMITS.md` in the repository root with:
- A "# Feature Development Tracking" title
- A "## Overview" section explaining this tracks major feature additions across repository branches
- A "## Feature Commit History" section with this exact table format:
```markdown
| Feature Name | Commit SHA | Author | Branch | Date | Files Changed | Commit Message |
|-------------|------------|---------|---------|------|---------------|----------------|
```
For each feature, populate the table with:
- Exact commit SHA (full 40-character hash)
- GitHub username of the commit author
- Branch where the commit was made
- Commit date in YYYY-MM-DD format
- Number of files changed in that commit
- First line of the commit message
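All of these fields are available from a single commit lookup; the following minimal sketch uses the same placeholder repository and token, with a hypothetical `SHA` standing in for the commit you identify:
```python
# Illustrative sketch: fetch one commit and pull out the table fields
# (author login, date, number of files changed, first line of the message).
import os
import requests

ORG, REPO = "your-org", "claude-code"   # placeholder values
SHA = "<full-40-character-commit-sha>"  # placeholder for the SHA you found
resp = requests.get(
    f"https://api.github.com/repos/{ORG}/{REPO}/commits/{SHA}",
    headers={
        "Authorization": f"Bearer {os.environ['MCP_GITHUB_TOKEN']}",
        "Accept": "application/vnd.github.v3+json",
    },
)
resp.raise_for_status()
data = resp.json()
author_login = (data.get("author") or {}).get("login", "")  # null for unlinked emails
commit_date = data["commit"]["author"]["date"][:10]         # ISO timestamp -> YYYY-MM-DD
files_changed = len(data.get("files", []))
first_line = data["commit"]["message"].split("\n")[0]
print(author_login, commit_date, files_changed, first_line)
```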
**Step 3: Commit Documentation to Repository**
Commit the `FEATURE_COMMITS.md` file to the main branch with:
- Commit message: "Add feature development tracking documentation"
- Ensure the file is properly formatted markdown
- Verify all commit SHAs in the table are accurate and verifiable
The verification process will check that your table contains the correct commit SHAs for each specific feature, along with accurate author, branch, and date information.
================================================
FILE: tasks/github/standard/claude-code/feature_commit_tracking/meta.json
================================================
{
"task_id": "feature_commit_tracking",
"task_name": "Feature Commit Tracking",
"category_id": "claude-code",
"category_name": "Claude Code",
"description": "Research development history across branches to track when specific features were introduced and create comprehensive documentation.",
"author": "Zijian Wu",
"created_at": "2025-08-15",
"difficulty": "L3",
"tags": [
"repository analysis",
"release coordination"
],
"mcp": [
"github"
],
"meta_data": {
"stateType": "url",
"stateContent": null,
"stateUrl": "https://github.com/mcpmark-source/claude-code",
"stateOriginalUrl": "https://github.com/anthropics/claude-code"
}
}
================================================
FILE: tasks/github/standard/claude-code/feature_commit_tracking/verify.py
================================================
import sys
import os
import requests
from typing import Dict, List, Optional, Tuple
import base64
import re
from dotenv import load_dotenv
def _get_github_api(
endpoint: str, headers: Dict[str, str], org: str, repo: str = "claude-code"
) -> Tuple[bool, Optional[Dict]]:
"""Make a GET request to GitHub API and return (success, response)."""
url = f"https://api.github.com/repos/{org}/{repo}/{endpoint}"
try:
response = requests.get(url, headers=headers)
if response.status_code == 200:
return True, response.json()
elif response.status_code == 404:
return False, None
else:
print(f"API error for {endpoint}: {response.status_code}", file=sys.stderr)
return False, None
except Exception as e:
print(f"Exception for {endpoint}: {e}", file=sys.stderr)
return False, None
def _get_file_content(
file_path: str,
headers: Dict[str, str],
org: str,
repo: str = "claude-code",
ref: str = "main",
) -> Optional[str]:
"""Get the content of a file from the repository."""
success, result = _get_github_api(
f"contents/{file_path}?ref={ref}", headers, org, repo
)
if not success or not result:
return None
try:
content = base64.b64decode(result.get("content", "")).decode("utf-8")
return content
except Exception as e:
print(f"Content decode error for {file_path}: {e}", file=sys.stderr)
return None
def _verify_commit_exists(
commit_sha: str, headers: Dict[str, str], org: str, repo: str = "claude-code"
) -> Tuple[bool, Optional[Dict]]:
"""Verify that a commit exists and return its details."""
success, commit_data = _get_github_api(f"commits/{commit_sha}", headers, org, repo)
return success, commit_data
def _parse_feature_table(content: str) -> List[Dict]:
"""Parse the feature commit table from markdown content."""
features = []
lines = content.split("\n")
in_table = False
for line in lines:
# Look for table header
if (
"| Feature Name | Commit SHA | Author | Branch | Date | Files Changed | Commit Message |"
in line
):
in_table = True
continue
if in_table and line.startswith("|---"):
continue
# Parse table rows
if in_table and line.startswith("|"):
parts = [p.strip() for p in line.split("|")]
if len(parts) >= 8: # Should have 7 columns plus empty parts at start/end
feature_name = parts[1].strip()
commit_sha = parts[2].strip()
author = parts[3].strip()
branch = parts[4].strip()
date = parts[5].strip()
files_changed = parts[6].strip()
commit_message = parts[7].strip()
if feature_name and commit_sha and author and branch and date:
features.append(
{
"name": feature_name,
"sha": commit_sha,
"author": author,
"branch": branch,
"date": date,
"files_changed": files_changed,
"commit_message": commit_message,
}
)
# Stop at end of table section
if in_table and line and not line.startswith("|") and "##" in line:
break
return features
def verify_task() -> bool:
"""Verify the feature commit tracking task."""
# Load environment variables from .mcp_env
load_dotenv(".mcp_env")
# Get GitHub token and org
github_token = os.environ.get("MCP_GITHUB_TOKEN")
github_org = os.environ.get("GITHUB_EVAL_ORG")
if not github_token:
print("Error: MCP_GITHUB_TOKEN environment variable not set", file=sys.stderr)
return False
if not github_org:
print("Error: GITHUB_EVAL_ORG environment variable not set", file=sys.stderr)
return False
headers = {
"Authorization": f"Bearer {github_token}",
"Accept": "application/vnd.github.v3+json",
}
# Expected feature commits based on exploration
expected_features = {
"Shell Completion Scripts": "8a0febdd09bda32f38c351c0881784460d69997d",
"CHANGELOG Version 1.0.65": "94dcaca5d71ad82644ae97f3a2b0c5eb8b63eae0",
"Rust Extraction Improvements": "50e58affdf1bfc7d875202bc040ebe0dcfb7d332",
}
# Expected authors for each commit
expected_authors = {
"8a0febdd09bda32f38c351c0881784460d69997d": "gitmpr",
"94dcaca5d71ad82644ae97f3a2b0c5eb8b63eae0": "QwertyJack",
"50e58affdf1bfc7d875202bc040ebe0dcfb7d332": "alokdangre",
}
# Expected commit messages for each commit
expected_messages = {
"8a0febdd09bda32f38c351c0881784460d69997d": "feat: add shell completions (bash, zsh, fish)",
"94dcaca5d71ad82644ae97f3a2b0c5eb8b63eae0": "Merge branch 'anthropics:main' into main",
"50e58affdf1bfc7d875202bc040ebe0dcfb7d332": "Enhance Rust extraction and output handling in workflows",
}
# Expected dates for each commit (YYYY-MM-DD format)
expected_dates = {
"8a0febdd09bda32f38c351c0881784460d69997d": "2025-08-01",
"94dcaca5d71ad82644ae97f3a2b0c5eb8b63eae0": "2025-08-02",
"50e58affdf1bfc7d875202bc040ebe0dcfb7d332": "2025-08-09",
}
print("Verifying feature commit tracking task...")
# 1. Check if FEATURE_COMMITS.md exists in main branch
print("1. Checking if FEATURE_COMMITS.md exists...")
content = _get_file_content("FEATURE_COMMITS.md", headers, github_org)
if not content:
print("Error: FEATURE_COMMITS.md not found in main branch", file=sys.stderr)
return False
print("✓ FEATURE_COMMITS.md found")
# 2. Check required sections exist
print("2. Checking required sections...")
required_sections = [
"# Feature Development Tracking",
"## Overview",
"## Feature Commit History",
]
for section in required_sections:
if section not in content:
print(f"Error: Missing required section '{section}'", file=sys.stderr)
return False
print("✓ All required sections present")
# 3. Parse and validate feature table
print("3. Parsing and validating feature table...")
features = _parse_feature_table(content)
if len(features) < 3:
print(
f"Error: Expected at least 3 features, found {len(features)}",
file=sys.stderr,
)
return False
# 4. Verify each expected feature is present with correct commit SHA
print("4. Verifying feature commit SHAs...")
found_features = {}
for feature in features:
found_features[feature["name"]] = feature["sha"]
for feature_name, expected_sha in expected_features.items():
if feature_name not in found_features:
print(
f"Error: Feature '{feature_name}' not found in table", file=sys.stderr
)
return False
actual_sha = found_features[feature_name]
if actual_sha != expected_sha:
print(
f"Error: Wrong SHA for '{feature_name}'. Expected: {expected_sha}, Got: {actual_sha}",
file=sys.stderr,
)
return False
print("✓ All feature commit SHAs are correct")
# 5. Verify each commit exists and has correct author
print("5. Verifying commit details...")
for feature in features:
if feature["sha"] in expected_features.values():
success, commit_data = _verify_commit_exists(
feature["sha"], headers, github_org
)
if not success:
print(f"Error: Commit {feature['sha']} not found", file=sys.stderr)
return False
# Check author
expected_author = expected_authors.get(feature["sha"])
if expected_author:
actual_author = commit_data.get("author", {}).get("login", "")
if actual_author != expected_author:
print(
f"Error: Wrong author for {feature['sha']}. Expected: {expected_author}, Got: {actual_author}",
file=sys.stderr,
)
return False
# Check commit message (compare with table entry)
expected_message = expected_messages.get(feature["sha"])
if expected_message and "commit_message" in feature:
if feature["commit_message"] != expected_message:
print(
f"Error: Wrong commit message in table for {feature['sha']}. Expected: '{expected_message}', Got: '{feature['commit_message']}'",
file=sys.stderr,
)
return False
# Also verify against actual commit data
if expected_message:
actual_message = (
commit_data.get("commit", {}).get("message", "").split("\n")[0]
) # First line only
if actual_message != expected_message:
print(
f"Error: Wrong commit message for {feature['sha']}. Expected: '{expected_message}', Got: '{actual_message}'",
file=sys.stderr,
)
return False
# Check date format (YYYY-MM-DD)
if not re.match(r"^\d{4}-\d{2}-\d{2}$", feature["date"]):
print(
f"Error: Invalid date format for {feature['name']}: {feature['date']}",
file=sys.stderr,
)
return False
# Check actual date matches expected
expected_date = expected_dates.get(feature["sha"])
if expected_date:
if feature["date"] != expected_date:
print(
f"Error: Wrong date for {feature['sha']}. Expected: {expected_date}, Got: {feature['date']}",
file=sys.stderr,
)
return False
print("✓ All commit details verified")
# 6. Verify the table format is correct
print("6. Verifying table format...")
table_header = "| Feature Name | Commit SHA | Author | Branch | Date | Files Changed | Commit Message |"
if table_header not in content:
print("Error: Table header format is incorrect", file=sys.stderr)
return False
# Check that all features have complete information
for feature in features:
if not all(
[
feature["name"],
feature["sha"],
feature["author"],
feature["branch"],
feature["date"],
feature.get("commit_message", ""),
]
):
print(
f"Error: Incomplete information for feature: {feature['name']}",
file=sys.stderr,
)
return False
print("✓ Table format is correct and complete")
print("\n✅ All verification checks passed!")
print("Feature commit tracking completed successfully:")
print(" - File: FEATURE_COMMITS.md created in main branch")
print(f" - Features tracked: {len(features)}")
print(" - All expected commit SHAs verified")
print(" - All commit authors verified")
print(" - Analysis summary complete")
return True
if __name__ == "__main__":
success = verify_task()
sys.exit(0 if success else 1)
================================================
FILE: tasks/github/standard/claude-code/label_color_standardization/description.md
================================================
I need you to implement a comprehensive label documentation and organization workflow for the repository.
**Step 1: Create Label Documentation Issue**
Create a new issue with:
- Title containing: "Document label organization for better visual organization" and "label guide"
- Body must include:
- A "## Problem" heading describing the need for better label documentation
- A "## Proposed Solution" heading about creating a comprehensive label guide for different label categories
- A "## Benefits" heading listing improved visual organization and easier issue triage
- Keywords: "label documentation", "visual organization", "label guide", "organization"
- Labels: Initially add "enhancement" and "documentation" labels to the issue
**Step 2: Create Feature Branch**
Create a new branch called 'feat/label-color-guide' from main.
**Step 3: Create Label Documentation**
On the feature branch, create the file `docs/LABEL_COLORS.md` with:
- A "# Label Organization Guide" title
- A "## Label Categories" section with a table that MUST follow this exact format:
```markdown
| Label Name | Category | Description |
|------------|----------|-------------|
```
The table must include ALL existing labels in the repository. For each label:
- Group labels by category (e.g., issue-type, platform, area, status, performance)
- Include a description for each label
The file must also contain a "## Usage Guidelines" section explaining when to use each label category.
**Step 4: Apply ALL Labels to the Documentation Issue**
Update the issue you created in Step 1 by adding ALL existing labels from the repository. This serves as a visual demonstration of the label organization. The issue should have every single label that exists in the repository applied to it.
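One hedged way to do this, with `ORG`, `REPO`, and `ISSUE_NUMBER` as placeholders and a token assumed in `MCP_GITHUB_TOKEN`, is to list every label defined in the repository and add the full set to the issue in one call:
```python
# Illustrative sketch: list every label in the repository, then add the full
# set to the documentation issue. ORG, REPO, and ISSUE_NUMBER are placeholders.
import os
import requests

ORG, REPO, ISSUE_NUMBER = "your-org", "claude-code", 0
HEADERS = {
    "Authorization": f"Bearer {os.environ['MCP_GITHUB_TOKEN']}",
    "Accept": "application/vnd.github.v3+json",
}
labels = requests.get(
    f"https://api.github.com/repos/{ORG}/{REPO}/labels",
    headers=HEADERS,
    params={"per_page": 100},
)
labels.raise_for_status()
label_names = [label["name"] for label in labels.json()]
# POST to the issue's labels endpoint appends labels without removing existing ones.
resp = requests.post(
    f"https://api.github.com/repos/{ORG}/{REPO}/issues/{ISSUE_NUMBER}/labels",
    headers=HEADERS,
    json={"labels": label_names},
)
resp.raise_for_status()
```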
**Step 5: Create Pull Request**
Create a pull request from 'feat/label-color-guide' to 'main' with:
- Title containing: "Add label organization guide" and "visual organization"
- Body must include:
- A "## Summary" heading explaining the label organization documentation
- A "## Changes" heading with a bullet list of what was added
- "Fixes #[ISSUE_NUMBER]" pattern linking to your created issue
- A "## Verification" section stating that all labels have been documented
- Keywords: "label documentation", "organization guide", "visual improvement", "documentation"
- Labels: Add a reasonable subset of labels to the PR (at least 5-10 labels from different categories)
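A minimal sketch of opening such a pull request through the REST API, using the same placeholder values, could look like this:
```python
# Illustrative sketch: open the pull request from the feature branch into main.
# ORG, REPO, and ISSUE_NUMBER are placeholders; the body text is abbreviated.
import os
import requests

ORG, REPO, ISSUE_NUMBER = "your-org", "claude-code", 0
resp = requests.post(
    f"https://api.github.com/repos/{ORG}/{REPO}/pulls",
    headers={
        "Authorization": f"Bearer {os.environ['MCP_GITHUB_TOKEN']}",
        "Accept": "application/vnd.github.v3+json",
    },
    json={
        "title": "Add label organization guide for visual organization",
        "head": "feat/label-color-guide",
        "base": "main",
        "body": (
            "## Summary\n...\n"
            "## Changes\n- ...\n"
            f"Fixes #{ISSUE_NUMBER}\n"
            "## Verification\nAll labels have been documented.\n"
        ),
    },
)
resp.raise_for_status()
print("Created PR", resp.json()["number"])
```
Labels can then be attached to the pull request through the same issues labels endpoint shown in Step 4, since a pull request shares its number with an issue.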
**Step 6: Document Changes in Issue**
Add a comment to the original issue with:
- Confirmation that the label documentation has been created
- Total count of labels documented
- Reference to the PR using "PR #[NUMBER]" pattern
- Keywords: "documentation created", "label guide complete", "organization complete"
================================================
FILE: tasks/github/standard/claude-code/label_color_standardization/meta.json
================================================
{
"task_id": "label_color_standardization",
"task_name": "Label Color Standardization",
"category_id": "claude-code",
"category_name": "Claude Code",
"description": "Standardize label colors from default gray to a comprehensive color scheme for better visual organization and issue triage.",
"author": "Zijian Wu",
"created_at": "2025-08-15",
"difficulty": "L3",
"tags": [
"issue management",
"workflow automation"
],
"mcp": [
"github"
],
"meta_data": {
"stateType": "url",
"stateContent": null,
"stateUrl": "https://github.com/mcpmark-source/claude-code",
"stateOriginalUrl": "https://github.com/anthropics/claude-code"
}
}
================================================
FILE: tasks/github/standard/claude-code/label_color_standardization/verify.py
================================================
import sys
import os
import requests
from typing import Dict, List, Optional, Tuple
from dotenv import load_dotenv
def _get_github_api(
endpoint: str, headers: Dict[str, str], org: str, repo: str = "claude-code"
) -> Tuple[bool, Optional[Dict]]:
"""Make a GET request to GitHub API and return (success, response)."""
url = f"https://api.github.com/repos/{org}/{repo}/{endpoint}"
try:
response = requests.get(url, headers=headers)
if response.status_code == 200:
return True, response.json()
elif response.status_code == 404:
return False, None
else:
print(f"API error for {endpoint}: {response.status_code}", file=sys.stderr)
return False, None
except Exception as e:
print(f"Exception for {endpoint}: {e}", file=sys.stderr)
return False, None
def _check_branch_exists(
branch_name: str, headers: Dict[str, str], org: str, repo: str = "claude-code"
) -> bool:
"""Verify that a branch exists in the repository."""
success, _ = _get_github_api(f"branches/{branch_name}", headers, org, repo)
return success
def _check_file_content(
branch: str,
file_path: str,
headers: Dict[str, str],
org: str,
repo: str = "claude-code",
) -> Optional[str]:
"""Get file content from a branch."""
import base64
success, result = _get_github_api(
f"contents/{file_path}?ref={branch}", headers, org, repo
)
if not success or not result:
return None
if result.get("content"):
try:
content = base64.b64decode(result.get("content", "")).decode("utf-8")
return content
except Exception as e:
print(f"Content decode error for {file_path}: {e}", file=sys.stderr)
return None
return None
def _parse_label_table(content: str) -> List[str]:
"""Parse the label table from markdown content and return label names."""
documented_labels = []
# Find the table in the content
lines = content.split("\n")
in_table = False
for line in lines:
# Skip header and separator lines
if "| Label Name | Category |" in line:
in_table = True
continue
if in_table and line.startswith("|---"):
continue
# Parse table rows
if in_table and line.startswith("|"):
parts = [p.strip() for p in line.split("|")]
if len(parts) >= 3: # Should have at least label, category
label_name = parts[1].strip()
if label_name:
documented_labels.append(label_name)
# Stop at end of table
if in_table and line and not line.startswith("|"):
break
return documented_labels
def _find_issue_by_title_keywords(
title_keywords: List[str],
headers: Dict[str, str],
org: str,
repo: str = "claude-code",
) -> Optional[Dict]:
"""Find an issue by title keywords and return the issue data."""
for state in ["open", "closed"]:
success, issues = _get_github_api(
f"issues?state={state}&per_page=100", headers, org, repo
)
if success and issues:
for issue in issues:
# Skip pull requests
if "pull_request" in issue:
continue
title = issue.get("title", "").lower()
if all(keyword.lower() in title for keyword in title_keywords):
return issue
return None
def _find_pr_by_title_keywords(
title_keywords: List[str],
headers: Dict[str, str],
org: str,
repo: str = "claude-code",
) -> Optional[Dict]:
"""Find a PR by title keywords and return the PR data."""
for state in ["open", "closed"]:
success, prs = _get_github_api(
f"pulls?state={state}&per_page=100", headers, org, repo
)
if success and prs:
for pr in prs:
title = pr.get("title", "").lower()
if all(keyword.lower() in title for keyword in title_keywords):
return pr
return None
def _get_issue_comments(
issue_number: int, headers: Dict[str, str], org: str, repo: str = "claude-code"
) -> List[Dict]:
"""Get all comments for an issue."""
success, comments = _get_github_api(
f"issues/{issue_number}/comments", headers, org, repo
)
if success and comments:
return comments
return []
def verify() -> bool:
"""
Programmatically verify that the label color standardization workflow meets the
requirements described in description.md.
"""
# Load environment variables from .mcp_env
load_dotenv(".mcp_env")
# Get GitHub token and org
github_token = os.environ.get("MCP_GITHUB_TOKEN")
github_org = os.environ.get("GITHUB_EVAL_ORG")
if not github_token:
print("Error: MCP_GITHUB_TOKEN environment variable not set", file=sys.stderr)
return False
if not github_org:
print("Error: GITHUB_EVAL_ORG environment variable not set", file=sys.stderr)
return False
# Configuration constants
BRANCH_NAME = "feat/label-color-guide"
# Issue requirements
ISSUE_TITLE_KEYWORDS = ["Document label organization", "label guide"]
ISSUE_KEYWORDS = [
"label documentation",
"visual organization",
"label guide",
"organization",
]
# PR requirements
PR_TITLE_KEYWORDS = ["label organization guide", "visual organization"]
PR_KEYWORDS = [
"label documentation",
"organization guide",
"visual improvement",
"documentation",
]
# All expected labels in the repository that are actually used/discoverable via MCP tools
# Note: Excludes 'wontfix', 'invalid', 'good first issue', 'help wanted' as they exist
# in the repository but are not used by any issues (not discoverable via MCP search)
ALL_EXPECTED_LABELS = [
"bug",
"enhancement",
"duplicate",
"question",
"documentation",
"platform:macos",
"platform:linux",
"platform:windows",
"area:core",
"area:tools",
"area:tui",
"area:ide",
"area:mcp",
"area:api",
"area:security",
"area:model",
"area:auth",
"area:packaging",
"has repro",
"memory",
"perf:memory",
"external",
]
headers = {
"Authorization": f"token {github_token}",
"Accept": "application/vnd.github.v3+json",
}
# Run verification checks
print("Verifying label color standardization workflow completion...")
# 1. Check that feature branch exists
print("1. Verifying feature branch exists...")
if not _check_branch_exists(BRANCH_NAME, headers, github_org):
print(f"Error: Branch '{BRANCH_NAME}' not found", file=sys.stderr)
return False
# 2. Check documentation file exists and has correct format
print("2. Verifying label documentation file...")
doc_content = _check_file_content(
BRANCH_NAME, "docs/LABEL_COLORS.md", headers, github_org
)
if not doc_content:
print("Error: docs/LABEL_COLORS.md not found", file=sys.stderr)
return False
# Parse the label table from documentation
documented_labels = _parse_label_table(doc_content)
if len(documented_labels) < 20:
print(
f"Error: Documentation table incomplete, found only {len(documented_labels)} labels",
file=sys.stderr,
)
return False
# 3. Verify labels are documented
print("3. Verifying expected labels are documented...")
print(f" ✓ {len(ALL_EXPECTED_LABELS)} expected labels defined for verification")
# 4. Find the created issue
print("4. Verifying issue creation...")
issue = _find_issue_by_title_keywords(ISSUE_TITLE_KEYWORDS, headers, github_org)
if not issue:
print(
"Error: Issue with title containing required keywords not found",
file=sys.stderr,
)
return False
issue_number = issue.get("number")
issue_body = issue.get("body", "")
# Check issue content has required sections and keywords
issue_required_sections = ["## Problem", "## Proposed Solution", "## Benefits"]
for section in issue_required_sections:
if section not in issue_body:
print(f"Error: Issue body missing required section: {section}", file=sys.stderr)
return False
# Check issue has required keywords
if not all(keyword.lower() in issue_body.lower() for keyword in ISSUE_KEYWORDS):
missing_keywords = [kw for kw in ISSUE_KEYWORDS if kw.lower() not in issue_body.lower()]
print(f"Error: Issue body missing required keywords: {missing_keywords}", file=sys.stderr)
return False
# Check issue has initial required labels (enhancement and documentation)
issue_label_names = [label["name"] for label in issue.get("labels", [])]
initial_required_labels = ["enhancement", "documentation"]
for required_label in initial_required_labels:
if required_label not in issue_label_names:
print(f"Error: Issue missing initial required label: {required_label}", file=sys.stderr)
return False
# 5. Find the created PR
print("5. Verifying pull request creation...")
pr = _find_pr_by_title_keywords(PR_TITLE_KEYWORDS, headers, github_org)
if not pr:
print(
"Error: PR with title containing required keywords not found",
file=sys.stderr,
)
return False
pr_number = pr.get("number")
pr_body = pr.get("body", "")
pr_labels = pr.get("labels", [])
# Check PR references issue with correct pattern
if f"Fixes #{issue_number}" not in pr_body and f"fixes #{issue_number}" not in pr_body:
print(f"Error: PR does not contain 'Fixes #{issue_number}' pattern", file=sys.stderr)
return False
# Check PR body has required sections and keywords
pr_required_sections = ["## Summary", "## Changes", "## Verification"]
for section in pr_required_sections:
if section not in pr_body:
print(f"Error: PR body missing required section: {section}", file=sys.stderr)
return False
# Check PR has required keywords
if not all(keyword.lower() in pr_body.lower() for keyword in PR_KEYWORDS):
missing_keywords = [kw for kw in PR_KEYWORDS if kw.lower() not in pr_body.lower()]
print(f"Error: PR body missing required keywords: {missing_keywords}", file=sys.stderr)
return False
# Check PR has sufficient labels (at least 5 from different categories)
if len(pr_labels) < 5:
print(f"Error: PR has only {len(pr_labels)} labels, needs at least 5", file=sys.stderr)
return False
# 6. Verify issue has ALL expected/usable labels applied (demonstrates organization)
print("6. Verifying issue has all expected labels applied...")
issue_label_names = [label["name"] for label in issue.get("labels", [])]
# Use our expected labels list instead of all repo labels (excludes unused labels)
expected_labels_to_check = ALL_EXPECTED_LABELS
missing_labels = []
for expected_label in expected_labels_to_check:
if expected_label not in issue_label_names:
missing_labels.append(expected_label)
if missing_labels:
print(
f"Error: Issue missing {len(missing_labels)} expected labels: {missing_labels[:5]}...",
file=sys.stderr,
)
return False
print(f" ✓ Issue has all {len(expected_labels_to_check)} expected labels applied")
# 7. Verify issue has comment documenting changes
print("7. Verifying issue comment with documentation...")
issue_comments = _get_issue_comments(issue_number, headers, github_org)
found_update_comment = False
comment_required_keywords = ["documentation created", "label guide complete", "organization complete"]
for comment in issue_comments:
body = comment.get("body", "")
# Check for PR reference and required keywords
if (f"PR #{pr_number}" in body and
any(keyword.lower() in body.lower() for keyword in comment_required_keywords) and
"total" in body.lower() and "labels" in body.lower()):
found_update_comment = True
break
if not found_update_comment:
print("Error: Issue missing comment documenting changes with required content", file=sys.stderr)
print(" Comment should include: PR reference, label count, and completion keywords", file=sys.stderr)
return False
# 8. Final verification of complete workflow
print("8. Final verification of workflow completion...")
# Skip repository label existence check - we trust that our expected labels
# are the ones actually discoverable/usable via MCP tools
# Ensure expected labels are documented (not all repo labels, since some are unused)
documented_label_count = len(documented_labels)
expected_label_count = len(ALL_EXPECTED_LABELS)
if documented_label_count < expected_label_count:
print(
f"Error: Documentation incomplete - {documented_label_count} documented vs {expected_label_count} expected",
file=sys.stderr,
)
return False
# Check that all expected labels are documented
missing_documented_labels = []
for expected_label in ALL_EXPECTED_LABELS:
if expected_label not in documented_labels:
missing_documented_labels.append(expected_label)
if missing_documented_labels:
print(
f"Error: Documentation missing expected labels: {missing_documented_labels}",
file=sys.stderr,
)
return False
print(f" ✓ All {expected_label_count} expected labels documented")
print(f" ✓ All {len(ALL_EXPECTED_LABELS)} expected labels present and documented")
print("\n✓ All verification checks passed!")
print("Label documentation workflow completed successfully:")
print(
f" - Issue #{issue_number}: {issue.get('title')} (with all {len(issue_label_names)} labels)"
)
print(f" - PR #{pr_number}: {pr.get('title')}")
print(f" - Branch: {BRANCH_NAME}")
print(" - Documentation: docs/LABEL_COLORS.md")
print(f" - {expected_label_count} labels documented for better organization")
return True
if __name__ == "__main__":
success = verify()
sys.exit(0 if success else 1)
================================================
FILE: tasks/github/standard/easyr1/advanced_branch_strategy/description.md
================================================
The EasyR1 repository has a critical production issue: all development happens directly on the `main` branch, which is extremely risky for a project with 25 active issues. A recent commit `098931530606d22f867fd121b1dcb3225a43661f` introduced protocol changes that need to be properly managed through a structured branching workflow. I need you to implement a complete GitFlow strategy by working through a realistic development scenario.
**The Scenario:** You're preparing for the v1.0.0 release while simultaneously handling a critical protocol serialization bug that was introduced in the recent data proto changes.
**Step 1: Initialize GitFlow Structure**
Create a `develop` branch from `main` as the new integration branch. Then create a `release/v1.0.0` branch from `develop` to prepare for the upcoming release.
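For illustration only, a minimal sketch of creating the `develop` branch through the Git references API is shown below (placeholder `your-org`, token assumed in `MCP_GITHUB_TOKEN`); the same two calls cover `release/v1.0.0` and the feature branch by changing the base ref and the new ref name:
```python
# Illustrative sketch: create the develop branch as a new ref pointing at the
# commit that main currently resolves to.
import os
import requests

ORG, REPO = "your-org", "EasyR1"  # placeholder values
HEADERS = {
    "Authorization": f"Bearer {os.environ['MCP_GITHUB_TOKEN']}",
    "Accept": "application/vnd.github.v3+json",
}
BASE = f"https://api.github.com/repos/{ORG}/{REPO}"
main_ref = requests.get(f"{BASE}/git/ref/heads/main", headers=HEADERS)
main_ref.raise_for_status()
main_sha = main_ref.json()["object"]["sha"]
resp = requests.post(
    f"{BASE}/git/refs",
    headers=HEADERS,
    json={"ref": "refs/heads/develop", "sha": main_sha},
)
resp.raise_for_status()
```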
**Step 2: Address the Critical Bug**
Create a `feature/protocol-serialization-fix` branch from `develop`. In this branch, create a new file called `PROTOCOL_FIXES.md` with the exact content:
```
# Protocol Serialization Fixes
## Critical Fix for Data Proto Issue
- Enhanced serialization safety check implemented
- Addresses issue from commit 098931530606d22f867fd121b1dcb3225a43661f
- Status: Ready for integration testing
```
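A hedged sketch of creating this file on the feature branch via the contents API follows (placeholder `your-org`, token assumed in `MCP_GITHUB_TOKEN`; the commit message is an assumption):
```python
# Illustrative sketch: create PROTOCOL_FIXES.md on the feature branch with the
# contents API; the file body mirrors the block above.
import base64
import os
import requests

ORG, REPO = "your-org", "EasyR1"  # placeholder values
CONTENT = """# Protocol Serialization Fixes
## Critical Fix for Data Proto Issue
- Enhanced serialization safety check implemented
- Addresses issue from commit 098931530606d22f867fd121b1dcb3225a43661f
- Status: Ready for integration testing
"""
resp = requests.put(
    f"https://api.github.com/repos/{ORG}/{REPO}/contents/PROTOCOL_FIXES.md",
    headers={
        "Authorization": f"Bearer {os.environ['MCP_GITHUB_TOKEN']}",
        "Accept": "application/vnd.github.v3+json",
    },
    json={
        "message": "Add protocol serialization fix notes",  # assumed commit message
        "content": base64.b64encode(CONTENT.encode("utf-8")).decode("ascii"),
        "branch": "feature/protocol-serialization-fix",
    },
)
resp.raise_for_status()
```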
**Step 3: Integrate the Fix Through Proper Workflow**
Create a pull request from `feature/protocol-serialization-fix` to `develop` to integrate the fix documentation. This demonstrates the feature → develop integration pattern.
**Step 4: Update Release Branch and CI/CD**
Merge the develop branch changes into the `release/v1.0.0` branch to include the critical fix in the release.
**Step 5: Document the New Process**
Create an issue titled `Implement Advanced Branch Protection Strategy` with exactly these 3 checkboxes in the body:
- [ ] All development flows through develop branch
- [ ] Release preparation happens in release/v1.0.0 branch
- [ ] Feature integration uses PR workflow
Add the label `process-implementation` to this issue to track the process implementation.
================================================
FILE: tasks/github/standard/easyr1/advanced_branch_strategy/meta.json
================================================
{
"task_id": "advanced_branch_strategy",
"task_name": "Advanced Branch Strategy",
"category_id": "easyr1",
"category_name": "EasyR1",
"description": "Implement GitFlow branching strategy with develop, release, and feature branches to replace risky direct-to-main development.",
"author": "Xiangyan Liu",
"created_at": "2025-08-15",
"difficulty": "L3",
"tags": [
"pr workflows",
"release coordination"
],
"mcp": [
"github"
],
"meta_data": {
"stateType": "url",
"stateContent": null,
"stateUrl": "https://github.com/mcpmark-source/EasyR1",
"stateOriginalUrl": "https://github.com/hiyouga/EasyR1"
}
}
================================================
FILE: tasks/github/standard/easyr1/advanced_branch_strategy/verify.py
================================================
import sys
import os
import requests
from typing import Dict, Optional, Tuple
from dotenv import load_dotenv
load_dotenv(".mcp_env")
def _get_github_api(
endpoint: str, headers: Dict[str, str]
) -> Tuple[bool, Optional[Dict]]:
"""Make a GET request to GitHub API and return (success, response)."""
github_org = os.environ.get("GITHUB_EVAL_ORG")
url = f"https://api.github.com/repos/{github_org}/EasyR1/{endpoint}"
try:
response = requests.get(url, headers=headers)
if response.status_code == 200:
return True, response.json()
elif response.status_code == 404:
return False, None
else:
print(f"API error for {endpoint}: {response.status_code}", file=sys.stderr)
return False, None
except Exception as e:
print(f"Exception for {endpoint}: {e}", file=sys.stderr)
return False, None
def _check_gitflow_branches(headers: Dict[str, str]) -> bool:
"""Check if GitFlow branches are properly created from correct base branches."""
success, branches_data = _get_github_api("branches", headers)
if not success or not branches_data:
print("Error: Could not fetch branches", file=sys.stderr)
return False
existing_branches = [branch.get("name", "") for branch in branches_data]
required_branches = [
"develop",
"release/v1.0.0",
"feature/protocol-serialization-fix",
]
for branch in required_branches:
if branch not in existing_branches:
print(f"Error: Required branch '{branch}' not found", file=sys.stderr)
return False
return True
def _check_protocol_fixes_file(headers: Dict[str, str]) -> bool:
"""Check if PROTOCOL_FIXES.md file exists in feature branch with correct content."""
success, file_data = _get_github_api(
"contents/PROTOCOL_FIXES.md?ref=feature/protocol-serialization-fix", headers
)
if not success or not file_data:
print("Error: PROTOCOL_FIXES.md not found in feature branch", file=sys.stderr)
return False
# Decode base64 content
import base64
content = base64.b64decode(file_data.get("content", "")).decode("utf-8")
# Check for required content elements
required_elements = [
"# Protocol Serialization Fixes",
"## Critical Fix for Data Proto Issue",
"Enhanced serialization safety check implemented",
"098931530606d22f867fd121b1dcb3225a43661f",
"Status: Ready for integration testing",
]
for element in required_elements:
if element not in content:
print(
f"Error: PROTOCOL_FIXES.md missing required content: {element}",
file=sys.stderr,
)
return False
return True
def _check_integration_workflow(headers: Dict[str, str]) -> Optional[Dict]:
"""Verify the feature → develop integration pull request exists."""
# Check both open and closed PRs since the workflow may have completed
success, prs = _get_github_api("pulls?state=all", headers)
if not success or not prs:
print("Error: Could not fetch pull requests", file=sys.stderr)
return None
for pr in prs:
head_ref = pr.get("head", {}).get("ref", "")
base_ref = pr.get("base", {}).get("ref", "")
if head_ref == "feature/protocol-serialization-fix" and base_ref == "develop":
return pr
print(
"Error: Integration PR from feature/protocol-serialization-fix to develop not found",
file=sys.stderr,
)
return None
def _check_release_branch_updated(headers: Dict[str, str]) -> bool:
"""Check if release branch contains the develop branch changes."""
# Check if PROTOCOL_FIXES.md exists in release branch
success, file_data = _get_github_api(
"contents/PROTOCOL_FIXES.md?ref=release/v1.0.0", headers
)
if not success or not file_data:
print(
"Error: PROTOCOL_FIXES.md not found in release branch - develop changes not merged",
file=sys.stderr,
)
return False
return True
def _check_process_documentation(headers: Dict[str, str]) -> Optional[Dict]:
"""Check if process is properly documented in an issue."""
success, issues = _get_github_api("issues", headers)
if not success or not issues:
print("Error: Could not fetch issues for documentation check", file=sys.stderr)
return None
expected_title = "Implement Advanced Branch Protection Strategy"
expected_checkboxes = [
"All development flows through develop branch",
"Release preparation happens in release/v1.0.0 branch",
"Feature integration uses PR workflow",
]
for issue in issues:
title = issue.get("title", "")
if title == expected_title:
body = issue.get("body", "")
# Check for exactly 3 checkboxes with specific content
checkbox_count = body.count("- [ ]") + body.count("- [x]")
if checkbox_count != 3:
print(
f"Error: Documentation issue should have 3 checkboxes, found {checkbox_count}",
file=sys.stderr,
)
return None
# Check for specific checkbox content
for expected_text in expected_checkboxes:
if expected_text not in body:
print(
f"Error: Documentation issue missing required checkbox: {expected_text}",
file=sys.stderr,
)
return None
# Check label assignment
labels = issue.get("labels", [])
label_names = [label.get("name") for label in labels]
if "process-implementation" not in label_names:
print(
"Error: Documentation issue not labeled with 'process-implementation'",
file=sys.stderr,
)
return None
return issue
print("Error: Process documentation issue not found", file=sys.stderr)
return None
def verify() -> bool:
"""
Verify the complete GitFlow implementation following the integrated workflow
described in description.md.
"""
# Get GitHub token
github_token = os.environ.get("MCP_GITHUB_TOKEN")
if not github_token:
print("Error: MCP_GITHUB_TOKEN environment variable not set", file=sys.stderr)
return False
headers = {
"Authorization": f"token {github_token}",
"Accept": "application/vnd.github.v3+json",
}
print("Verifying integrated GitFlow workflow implementation...")
# 1. Verify GitFlow structure initialization
print("1. Checking GitFlow branch structure...")
if not _check_gitflow_branches(headers):
return False
# 2. Verify critical bug fix implementation via new file
print("2. Checking protocol serialization fix documentation...")
if not _check_protocol_fixes_file(headers):
return False
# 3. Verify integration workflow (feature → develop PR)
print("3. Checking feature integration workflow...")
integration_pr = _check_integration_workflow(headers)
if not integration_pr:
return False
    # 4. Verify release branch updated with the develop changes
    print("4. Checking release branch sync...")
if not _check_release_branch_updated(headers):
return False
# 5. Verify process documentation
print("5. Checking process documentation...")
doc_issue = _check_process_documentation(headers)
if not doc_issue:
return False
print("\n✓ Integrated GitFlow workflow successfully implemented!")
print("✓ GitFlow structure: main → develop → release/v1.0.0 branches created")
print("✓ Critical fix: Protocol fix documented in PROTOCOL_FIXES.md file")
print(
f"✓ Integration: PR #{integration_pr.get('number')} demonstrates feature → develop workflow"
)
    print("✓ Release prep: Release branch contains develop changes")
print(
f"✓ Documentation: Process documented in issue #{doc_issue.get('number')} with proper checkboxes"
)
print(
"\nThe repository now has a structured GitFlow workflow ready for implementation!"
)
return True
if __name__ == "__main__":
success = verify()
sys.exit(0 if success else 1)
================================================
FILE: tasks/github/standard/easyr1/config_parameter_audit/description.md
================================================
I need you to perform a deep investigation into recent configuration changes in our EasyR1 repository that may be causing training instability issues.
## Task Requirements
### 1. Deep Commit Analysis
Find the exact commit SHA where the `micro_batch_size_per_device_for_update` parameter was changed from `4` to `1` in the `examples/config.yaml` file. Use GitHub API to:
- Examine recent commits that modified `examples/config.yaml`
- Get the specific commit diff showing this parameter change
- Identify the commit author and timestamp
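A minimal sketch of this lookup, assuming a placeholder `your-org` organization and a token in `MCP_GITHUB_TOKEN`, filters the commit list by path and scans each commit's patch for the parameter name:
```python
# Illustrative sketch: walk commits that touched examples/config.yaml and scan
# each commit's patch for the micro_batch_size_per_device_for_update change.
import os
import requests

ORG, REPO = "your-org", "EasyR1"  # placeholder values
HEADERS = {
    "Authorization": f"Bearer {os.environ['MCP_GITHUB_TOKEN']}",
    "Accept": "application/vnd.github.v3+json",
}
BASE = f"https://api.github.com/repos/{ORG}/{REPO}"
commits = requests.get(
    f"{BASE}/commits",
    headers=HEADERS,
    params={"path": "examples/config.yaml", "per_page": 100},
)
commits.raise_for_status()
for entry in commits.json():
    detail = requests.get(f"{BASE}/commits/{entry['sha']}", headers=HEADERS).json()
    for changed in detail.get("files", []):
        if changed.get("filename") == "examples/config.yaml":
            patch = changed.get("patch", "")
            if "micro_batch_size_per_device_for_update" in patch:
                # Author login, commit date, and the raw diff hunk are all in this response.
                print(entry["sha"], detail["commit"]["author"]["date"][:10])
```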
### 2. Related Parameter Investigation
In the same commit you found above, identify what value the `micro_batch_size_per_device_for_experience` parameter was changed to. Document:
- The before value for this parameter
- The after value for this parameter
- The specific line numbers in the diff where these changes occurred
### 3. Issue Search and Verification
Search through all GitHub issues (both open and closed) to find issues that contain specific keywords. Identify all issue numbers where the issue title or body text contains any of these exact terms:
- "OOM" (case insensitive)
- "memory" (case insensitive)
- "batch" (case insensitive)
- "显存" (GPU memory in Chinese)
You must find and list ALL issues that contain any of these keywords in their titles or bodies, regardless of whether you think they're related to the parameter changes.
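One hedged way to run this search, under the same placeholder assumptions, is to paginate the issues endpoint and test each title and body against the keyword list (note that this endpoint also returns pull requests):
```python
# Illustrative sketch: paginate all issues (open and closed) and collect the
# numbers whose title or body mentions any of the target keywords.
import os
import requests

ORG, REPO = "your-org", "EasyR1"  # placeholder values
HEADERS = {
    "Authorization": f"Bearer {os.environ['MCP_GITHUB_TOKEN']}",
    "Accept": "application/vnd.github.v3+json",
}
KEYWORDS = ["oom", "memory", "batch", "显存"]
matches, page = set(), 1
while True:
    resp = requests.get(
        f"https://api.github.com/repos/{ORG}/{REPO}/issues",
        headers=HEADERS,
        params={"state": "all", "per_page": 100, "page": page},
    )
    resp.raise_for_status()
    issues = resp.json()
    if not issues:
        break
    for issue in issues:
        text = (issue.get("title", "") + " " + (issue.get("body") or "")).lower()
        if any(keyword in text for keyword in KEYWORDS):
            matches.add(issue["number"])
    page += 1
print(sorted(matches))
```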
### 4. File Creation and Results
Create a file named exactly `ANALYSIS_RESULTS.json` in the repository root with this exact structure:
```json
{
"target_commit_sha": "full-40-character-commit-sha",
"commit_author": "author-username",
"commit_date": "YYYY-MM-DD",
"parameter_changes": {
"micro_batch_size_per_device_for_update": {
"before": 4,
"after": 1,
"line_number": 123
},
"micro_batch_size_per_device_for_experience": {
"before": 16,
"after": 2,
"line_number": 124
}
},
"related_issue_number_list": [9, 46]
}
```
### 5. Verification Requirements
- The commit SHA must be exactly 40 hexadecimal characters
- The parameter values must match the actual repository changes
- Every issue number in `related_issue_number_list` must reference a real issue in the repository
- All data must be obtained through GitHub API analysis, not guesswork
================================================
FILE: tasks/github/standard/easyr1/config_parameter_audit/meta.json
================================================
{
"task_id": "config_parameter_audit",
"task_name": "Config Parameter Audit",
"category_id": "easyr1",
"category_name": "EasyR1",
"description": "Investigate configuration changes causing training instability by analyzing commits and identifying related memory issues.",
"author": "Xiangyan Liu",
"created_at": "2025-08-15",
"difficulty": "L3",
"tags": [
"repository analysis",
"issue management"
],
"mcp": [
"github"
],
"meta_data": {
"stateType": "url",
"stateContent": null,
"stateUrl": "https://github.com/mcpmark-source/EasyR1",
"stateOriginalUrl": "https://github.com/hiyouga/EasyR1"
}
}
================================================
FILE: tasks/github/standard/easyr1/config_parameter_audit/verify.py
================================================
import sys
import os
import json
import requests
import re
from typing import Dict, Optional, Tuple
from dotenv import load_dotenv
load_dotenv(".mcp_env")
def _get_github_api(
endpoint: str, headers: Dict[str, str]
) -> Tuple[bool, Optional[Dict]]:
"""Make a GET request to GitHub API and return (success, response)."""
github_org = os.environ.get("GITHUB_EVAL_ORG")
url = f"https://api.github.com/repos/{github_org}/EasyR1/{endpoint}"
try:
response = requests.get(url, headers=headers)
if response.status_code == 200:
return True, response.json()
elif response.status_code == 404:
return False, None
else:
print(f"API error for {endpoint}: {response.status_code}", file=sys.stderr)
return False, None
except Exception as e:
print(f"Exception for {endpoint}: {e}", file=sys.stderr)
return False, None
def _get_analysis_results(headers: Dict[str, str]) -> Optional[Dict]:
"""Get ANALYSIS_RESULTS.json file content."""
success, file_data = _get_github_api("contents/ANALYSIS_RESULTS.json", headers)
if not success:
return None
# Decode base64 content
import base64
content = file_data.get("content", "")
if content:
try:
decoded_content = base64.b64decode(content).decode("utf-8")
return json.loads(decoded_content)
except Exception as e:
print(f"Error parsing JSON: {e}", file=sys.stderr)
return None
return None
def _verify_commit_data(results: Dict, headers: Dict[str, str]) -> bool:
"""Verify the commit data is accurate."""
commit_sha = results.get("target_commit_sha")
# Validate SHA format
if not re.match(r"^[a-f0-9]{40}$", commit_sha, re.IGNORECASE):
print(f"Error: Invalid commit SHA format: {commit_sha}", file=sys.stderr)
return False
# Get commit details
success, commit_data = _get_github_api(f"commits/{commit_sha}", headers)
if not success:
print(f"Error: Commit {commit_sha} not found in repository", file=sys.stderr)
return False
# Verify author
expected_author = results.get("commit_author")
actual_author = commit_data.get("author", {}).get("login")
if expected_author != actual_author:
print(
f"Error: Commit author mismatch. Expected: {expected_author}, Actual: {actual_author}",
file=sys.stderr,
)
return False
# Verify date format
commit_date = results.get("commit_date")
if not re.match(r"^\d{4}-\d{2}-\d{2}$", commit_date):
print(
f"Error: Invalid date format: {commit_date}. Expected YYYY-MM-DD",
file=sys.stderr,
)
return False
return True
def _verify_parameter_changes(results: Dict, headers: Dict[str, str]) -> bool:
"""Verify the parameter changes are accurate."""
param_changes = results.get("parameter_changes", {})
# Check required parameters exist
required_params = [
"micro_batch_size_per_device_for_update",
"micro_batch_size_per_device_for_experience",
]
for param in required_params:
if param not in param_changes:
print(f"Error: Missing parameter change data for: {param}", file=sys.stderr)
return False
change_data = param_changes[param]
if not all(key in change_data for key in ["before", "after", "line_number"]):
print(
f"Error: Incomplete change data for parameter: {param}", file=sys.stderr
)
return False
# Verify specific expected values based on known repository state
update_param = param_changes.get("micro_batch_size_per_device_for_update", {})
if update_param.get("before") != 4 or update_param.get("after") != 1:
print(
"Error: Incorrect values for micro_batch_size_per_device_for_update",
file=sys.stderr,
)
return False
experience_param = param_changes.get(
"micro_batch_size_per_device_for_experience", {}
)
if experience_param.get("before") != 16 or experience_param.get("after") != 2:
print(
"Error: Incorrect values for micro_batch_size_per_device_for_experience",
file=sys.stderr,
)
return False
return True
def _get_all_issues_with_keywords(headers: Dict[str, str]) -> set:
"""Find all issues in repository that contain the required keywords."""
required_keywords = ["oom", "memory", "batch", "显存"]
keyword_issues = set()
# Get all issues from repository (both open and closed)
page = 1
while True:
success, issues = _get_github_api(
f"issues?state=all&per_page=100&page={page}", headers
)
if not success or not issues:
break
for issue in issues:
issue_number = issue.get("number")
title = issue.get("title", "").lower()
body = issue.get("body", "").lower() if issue.get("body") else ""
issue_text = title + " " + body
# Check if any keyword appears in title or body
for keyword in required_keywords:
if keyword.lower() in issue_text:
keyword_issues.add(issue_number)
break
# If we got less than 100 issues, we're done
if len(issues) < 100:
break
page += 1
return keyword_issues
def _verify_issue_references(results: Dict, headers: Dict[str, str]) -> bool:
"""Verify the issue references contain the required keywords."""
issue_number_list = results.get("related_issue_number_list")
if not isinstance(issue_number_list, list) or len(issue_number_list) == 0:
print(
"Error: related_issue_number_list must be a non-empty list",
file=sys.stderr,
)
return False
# Required keywords to search for (case insensitive)
required_keywords = ["oom", "memory", "batch", "显存"]
# First, dynamically find all issues that contain the required keywords
expected_issues = _get_all_issues_with_keywords(headers)
    print(f"Issues containing required keywords: {sorted(expected_issues)}")
provided_issues = set(issue_number_list)
# Verify each provided issue contains at least one of the required keywords
for issue_number in issue_number_list:
if not isinstance(issue_number, int) or issue_number <= 0:
print(
f"Error: Invalid issue number format: {issue_number}", file=sys.stderr
)
return False
# Get issue details
success, issue_data = _get_github_api(f"issues/{issue_number}", headers)
if not success:
print(
f"Error: Issue #{issue_number} not found in repository", file=sys.stderr
)
return False
# Check if issue title or body contains any required keywords
title = issue_data.get("title", "").lower()
body = issue_data.get("body", "").lower() if issue_data.get("body") else ""
issue_text = title + " " + body
issue_has_keyword = False
for keyword in required_keywords:
if keyword.lower() in issue_text:
issue_has_keyword = True
break
if not issue_has_keyword:
print(
f"Error: Issue #{issue_number} does not contain any required keywords: {required_keywords}",
file=sys.stderr,
)
return False
# Verify agent found exactly the same issues as our dynamic search
if provided_issues != expected_issues:
missing = expected_issues - provided_issues
extra = provided_issues - expected_issues
if missing:
print(
f"Error: Missing issues that contain required keywords: {missing}",
file=sys.stderr,
)
if extra:
print(
f"Error: Extra issues that don't contain required keywords: {extra}",
file=sys.stderr,
)
return False
print(
f"✓ Found all {len(issue_number_list)} issues containing required keywords: {issue_number_list}"
)
return True
def verify() -> bool:
"""
Programmatically verify that the deep commit analysis meets the requirements.
"""
# Get GitHub token
github_token = os.environ.get("MCP_GITHUB_TOKEN")
if not github_token:
print("Error: MCP_GITHUB_TOKEN environment variable not set", file=sys.stderr)
return False
headers = {
"Authorization": f"token {github_token}",
"Accept": "application/vnd.github.v3+json",
}
print("Verifying deep commit analysis completion...")
# 1. Check ANALYSIS_RESULTS.json exists and is valid JSON
print("1. Checking ANALYSIS_RESULTS.json exists and is valid...")
results = _get_analysis_results(headers)
if not results:
print("Error: ANALYSIS_RESULTS.json not found or invalid JSON", file=sys.stderr)
return False
print("✓ Found valid ANALYSIS_RESULTS.json")
# 2. Verify commit data accuracy
print("2. Verifying commit data accuracy...")
if not _verify_commit_data(results, headers):
return False
print("✓ Commit SHA, author, and date verified")
# 3. Verify parameter changes accuracy
print("3. Verifying parameter changes accuracy...")
if not _verify_parameter_changes(results, headers):
return False
print("✓ Parameter changes verified with correct before/after values")
# 4. Verify issue references
print("4. Verifying issue references...")
if not _verify_issue_references(results, headers):
return False
print("\n✓ Task completed successfully!")
print("Deep commit analysis results verified:")
print(f"- Found target commit: {results.get('target_commit_sha')}")
print(
"- Verified parameter changes: micro_batch_size_per_device_for_update (4→1), micro_batch_size_per_device_for_experience (16→2)"
)
print(
f"- Verified memory/performance issue correlations: {results.get('related_issue_number_list')}"
)
print("- All data obtained through accurate GitHub API analysis")
return True
if __name__ == "__main__":
success = verify()
sys.exit(0 if success else 1)
================================================
FILE: tasks/github/standard/easyr1/performance_regression_investigation/description.md
================================================
In the EasyR1 repo, I've discovered that the recent commit `098931530606d22f867fd121b1dcb3225a43661f` (which fixed data proto) may have introduced performance regressions based on user reports in issues #39 and #41. I need you to create a systematic investigation workflow:
**Step 1: Create Main Tracking Issue**
Create a main issue with the exact title "Performance Regression Analysis: Data Protocol Changes" and add these 3 labels: "bug", "performance", "investigation".
**Step 2: Create Investigation Branches**
Create exactly 3 feature branches from main for different investigation tracks:
- `investigate-protocol-changes` - for testing protocol-related performance issues
- `investigate-batch-processing` - for testing batch processing performance issues
- `investigate-memory-usage` - for testing memory utilization performance issues
**Step 3: Create Sub-Issues**
Create 3 sub-issues and link them to the main tracking issue using sub-issue functionality:
- "Test Performance Impact: fix multi modal data oom"
- "Test Performance Impact: upgrade vllm to 0.10"
- "Test Performance Impact: non blocking false by default"
**Step 4: Document Changes**
Add at least 2 comments to the main tracking issue documenting the specific file changes from commit `098931530606d22f867fd121b1dcb3225a43661f`. Reference the exact files `verl/protocol.py` and `examples/config.yaml` with their commit SHA.
**Step 5: Create Analysis PR**
Create a pull request from the `investigate-protocol-changes` branch to main with the exact title "Performance Analysis: Protocol Changes Investigation".
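For reference, a minimal sketch of Steps 1 and 2 against the GitHub REST API. This is illustrative only: `OWNER` is a placeholder for the evaluation org, the token variable mirrors the benchmark's `MCP_GITHUB_TOKEN`, and an actual run is expected to perform these operations through the GitHub MCP server rather than raw HTTP calls.
```python
import os
import requests

OWNER, REPO = "OWNER", "EasyR1"  # illustrative placeholder
API = f"https://api.github.com/repos/{OWNER}/{REPO}"
headers = {
    "Authorization": f"token {os.environ['MCP_GITHUB_TOKEN']}",
    "Accept": "application/vnd.github.v3+json",
}

# Step 1: create the main tracking issue with the three required labels.
requests.post(
    f"{API}/issues",
    headers=headers,
    json={
        "title": "Performance Regression Analysis: Data Protocol Changes",
        "body": "Tracking issue for the data protocol performance investigation.",
        "labels": ["bug", "performance", "investigation"],
    },
).raise_for_status()

# Step 2: create each investigation branch from the tip of main.
main_sha = requests.get(f"{API}/git/ref/heads/main", headers=headers).json()["object"]["sha"]
for branch in (
    "investigate-protocol-changes",
    "investigate-batch-processing",
    "investigate-memory-usage",
):
    requests.post(
        f"{API}/git/refs",
        headers=headers,
        json={"ref": f"refs/heads/{branch}", "sha": main_sha},
    ).raise_for_status()
```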
================================================
FILE: tasks/github/standard/easyr1/performance_regression_investigation/meta.json
================================================
{
"task_id": "performance_regression_investigation",
"task_name": "Performance Regression Investigation",
"category_id": "easyr1",
"category_name": "EasyR1",
"description": "Create systematic investigation workflow for performance regressions with tracking issues, investigation branches, and sub-issues.",
"author": "Xiangyan Liu",
"created_at": "2025-08-15",
"difficulty": "L3",
"tags": [
"issue management",
"repository analysis"
],
"mcp": [
"github"
],
"meta_data": {
"stateType": "url",
"stateContent": null,
"stateUrl": "https://github.com/mcpmark-source/EasyR1",
"stateOriginalUrl": "https://github.com/hiyouga/EasyR1"
}
}
================================================
FILE: tasks/github/standard/easyr1/performance_regression_investigation/verify.py
================================================
import sys
import os
import requests
from typing import Dict, List, Optional, Tuple
from dotenv import load_dotenv
load_dotenv(".mcp_env")
def _get_github_api(
endpoint: str, headers: Dict[str, str]
) -> Tuple[bool, Optional[Dict]]:
"""Make a GET request to GitHub API and return (success, response)."""
github_org = os.environ.get("GITHUB_EVAL_ORG")
url = f"https://api.github.com/repos/{github_org}/EasyR1/{endpoint}"
try:
response = requests.get(url, headers=headers)
if response.status_code == 200:
return True, response.json()
elif response.status_code == 404:
return False, None
else:
print(f"API error for {endpoint}: {response.status_code}", file=sys.stderr)
return False, None
except Exception as e:
print(f"Exception for {endpoint}: {e}", file=sys.stderr)
return False, None
def _find_main_tracking_issue(headers: Dict[str, str]) -> Optional[Dict]:
"""Find the main tracking issue with exact title and required labels."""
success, issues = _get_github_api("issues?state=open&per_page=50", headers)
if not success or not issues:
return None
for issue in issues:
title = issue.get("title", "")
if title == "Performance Regression Analysis: Data Protocol Changes":
# Check labels
labels = [label.get("name", "") for label in issue.get("labels", [])]
required_labels = {"bug", "performance", "investigation"}
if required_labels.issubset(set(labels)):
return issue
return None
def _check_branches_exist(branch_names: List[str], headers: Dict[str, str]) -> bool:
"""Check if all required branches exist."""
for branch_name in branch_names:
success, _ = _get_github_api(f"branches/{branch_name}", headers)
if not success:
print(f"Error: Branch '{branch_name}' not found", file=sys.stderr)
return False
return True
def _check_sub_issues(
main_issue_number: int, expected_titles: List[str], headers: Dict[str, str]
) -> bool:
"""Check if sub-issues are created and linked to main issue."""
success, sub_issues = _get_github_api(
f"issues/{main_issue_number}/sub_issues", headers
)
if not success:
# If sub_issues endpoint doesn't exist, check for issues mentioning the main issue
success, all_issues = _get_github_api("issues?state=open&per_page=100", headers)
if not success:
return False
sub_issues = []
for issue in all_issues:
body = issue.get("body", "")
title = issue.get("title", "")
# Check if issue references main issue or has expected title pattern
if f"#{main_issue_number}" in body or any(
expected_title in title for expected_title in expected_titles
):
sub_issues.append(issue)
if not sub_issues or len(sub_issues) < 3:
print(
f"Error: Expected 3 sub-issues linked to main issue #{main_issue_number}",
file=sys.stderr,
)
return False
# Check if sub-issues have expected titles
found_titles = [issue.get("title", "") for issue in sub_issues]
for expected_title in expected_titles:
if not any(expected_title in title for title in found_titles):
print(
f"Error: Sub-issue with title containing '{expected_title}' not found",
file=sys.stderr,
)
return False
return True
def _check_issue_comments(issue_number: int, headers: Dict[str, str]) -> bool:
"""Check if main issue has at least 2 comments with file references."""
success, comments = _get_github_api(f"issues/{issue_number}/comments", headers)
if not success or not comments:
print(f"Error: No comments found on issue #{issue_number}", file=sys.stderr)
return False
if len(comments) < 2:
print(
f"Error: Expected at least 2 comments on issue #{issue_number}",
file=sys.stderr,
)
return False
# Check if comments reference specific files and commit
required_refs = [
"verl/protocol.py",
"examples/config.yaml",
"0989315",
]
comment_text = " ".join([comment.get("body", "") for comment in comments])
for ref in required_refs:
if ref not in comment_text:
print(f"Error: Comments missing reference to '{ref}'", file=sys.stderr)
return False
return True
def _find_analysis_pr(headers: Dict[str, str]) -> Optional[Dict]:
"""Find the analysis PR with exact title from specific branch."""
success, prs = _get_github_api("pulls?state=open&per_page=50", headers)
if not success or not prs:
return None
expected_title = "Performance Analysis: Protocol Changes Investigation"
expected_head = "investigate-protocol-changes"
for pr in prs:
title = pr.get("title", "")
head_ref = pr.get("head", {}).get("ref", "")
if title == expected_title and head_ref == expected_head:
return pr
return None
def verify() -> bool:
"""
Programmatically verify that the performance regression investigation workflow meets the
requirements described in description.md.
"""
# Get GitHub token
github_token = os.environ.get("MCP_GITHUB_TOKEN")
if not github_token:
print("Error: MCP_GITHUB_TOKEN environment variable not set", file=sys.stderr)
return False
headers = {
"Authorization": f"token {github_token}",
"Accept": "application/vnd.github.v3+json",
}
# Run verification checks
print("Verifying performance regression investigation workflow completion...")
# 1. Check main tracking issue exists with exact title and labels
print("1. Checking main tracking issue with required title and labels...")
main_issue = _find_main_tracking_issue(headers)
if not main_issue:
print(
"Error: Main tracking issue not found with exact title 'Performance Regression Analysis: Data Protocol Changes' and labels 'bug', 'performance', 'investigation'",
file=sys.stderr,
)
return False
main_issue_number = main_issue.get("number")
print(f"Found main tracking issue #{main_issue_number}")
# 2. Check that all 3 investigation branches exist
print("2. Checking investigation branches exist...")
required_branches = [
"investigate-protocol-changes",
"investigate-batch-processing",
"investigate-memory-usage",
]
if not _check_branches_exist(required_branches, headers):
return False
# 3. Check sub-issues are created and linked
print("3. Checking sub-issues are created and linked...")
expected_sub_titles = [
"Test Performance Impact: fix multi modal data oom",
"Test Performance Impact: upgrade vllm to 0.10",
"Test Performance Impact: non blocking false by default",
]
if not _check_sub_issues(main_issue_number, expected_sub_titles, headers):
return False
# 4. Check issue comments document file changes
print("4. Checking issue comments document file changes...")
if not _check_issue_comments(main_issue_number, headers):
return False
# 5. Check analysis PR exists with exact title from correct branch
print("5. Checking analysis PR exists with exact title and branch...")
analysis_pr = _find_analysis_pr(headers)
if not analysis_pr:
print(
"Error: Analysis PR not found with title 'Performance Analysis: Protocol Changes Investigation' from branch 'investigate-protocol-changes'",
file=sys.stderr,
)
return False
print(f"Found analysis PR #{analysis_pr.get('number')}")
print("\n✓ Task completed successfully!")
print(
f"Main tracking issue #{main_issue_number} created with proper labels and documentation"
)
print("All 3 investigation branches created for different investigation tracks")
print("3 sub-issues created and linked to main tracking issue")
print("Issue comments document file changes with commit SHA references")
print(f"Analysis PR #{analysis_pr.get('number')} created from correct branch")
return True
if __name__ == "__main__":
success = verify()
sys.exit(0 if success else 1)
================================================
FILE: tasks/github/standard/easyr1/qwen3_issue_management/description.md
================================================
The EasyR1 repository has several Qwen3-related issues that were closed but need to be reopened for further investigation. Qwen3 is an important model variant that requires continued attention. I need you to find and reopen all closed issues related to Qwen3 and properly tag them for tracking.
**Step 1: Find All Closed Qwen3 Issues**
Search for ALL closed issues that mention 'qwen3' (case-insensitive) in either the title or body. Make note of each issue number and title.
**Step 2: Reopen Each Qwen3 Issue**
For every closed issue that contains 'qwen3' (regardless of when it was closed or any other factors), reopen it by changing its state from closed to open.
**Step 3: Add Tracking Label**
After reopening each issue, add the label `qwen3-related` to it. This will help track all Qwen3-related issues in the future.
**Step 4: Create Summary Issue**
Create a new issue titled "Reopened Qwen3 Issues Summary" with the following content in the body:
```
# Qwen3 Issues Reopened
The following closed issues containing 'qwen3' have been reopened:
[List each reopened issue as: - #NUMBER: TITLE]
Total issues reopened: [NUMBER]
All reopened issues have been tagged with the `qwen3-related` label for easy tracking.
```
Add the label `qwen3-related` to this summary issue as well.
This straightforward workflow ensures all Qwen3-related closed issues are reopened and properly tagged for visibility.
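A minimal sketch of Steps 1-3 using the GitHub REST API directly (assumptions: `OWNER` is a placeholder for the evaluation org, the Search API is used to find closed issues, and a real run would go through the GitHub MCP server):
```python
import os
import requests

OWNER, REPO = "OWNER", "EasyR1"  # illustrative placeholder
API = f"https://api.github.com/repos/{OWNER}/{REPO}"
headers = {
    "Authorization": f"token {os.environ['MCP_GITHUB_TOKEN']}",
    "Accept": "application/vnd.github.v3+json",
}

# Step 1: find closed issues mentioning qwen3 (GitHub search is case-insensitive).
search = requests.get(
    "https://api.github.com/search/issues",
    headers=headers,
    params={"q": f"repo:{OWNER}/{REPO} qwen3 is:issue state:closed", "per_page": 100},
).json()

for item in search.get("items", []):
    number = item["number"]
    # Step 2: reopen the issue.
    requests.patch(f"{API}/issues/{number}", headers=headers, json={"state": "open"}).raise_for_status()
    # Step 3: add the tracking label.
    requests.post(
        f"{API}/issues/{number}/labels", headers=headers, json={"labels": ["qwen3-related"]}
    ).raise_for_status()
```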
================================================
FILE: tasks/github/standard/easyr1/qwen3_issue_management/meta.json
================================================
{
"task_id": "qwen3_issue_management",
"task_name": "Qwen3 Issue Management",
"category_id": "easyr1",
"category_name": "EasyR1",
"description": "Find and reopen all closed Qwen3-related issues with proper tagging for continued tracking and investigation.",
"author": "Zijian Wu",
"created_at": "2025-08-15",
"difficulty": "L3",
"tags": [
"issue management"
],
"mcp": [
"github"
],
"meta_data": {
"stateType": "url",
"stateContent": null,
"stateUrl": "https://github.com/mcpmark-source/EasyR1",
"stateOriginalUrl": "https://github.com/hiyouga/EasyR1"
}
}
================================================
FILE: tasks/github/standard/easyr1/qwen3_issue_management/verify.py
================================================
import sys
import os
import requests
from typing import Dict, List, Optional, Tuple
from dotenv import load_dotenv
load_dotenv(".mcp_env")
def _get_github_api(
endpoint: str, headers: Dict[str, str]
) -> Tuple[bool, Optional[Dict]]:
"""Make a GET request to GitHub API and return (success, response)."""
github_org = os.environ.get("GITHUB_EVAL_ORG")
url = f"https://api.github.com/repos/{github_org}/EasyR1/{endpoint}"
try:
response = requests.get(url, headers=headers)
if response.status_code == 200:
return True, response.json()
elif response.status_code == 404:
return False, None
else:
print(f"API error for {endpoint}: {response.status_code}", file=sys.stderr)
return False, None
except Exception as e:
print(f"Exception for {endpoint}: {e}", file=sys.stderr)
return False, None
def _search_github_issues(
query: str, headers: Dict[str, str]
) -> Tuple[bool, Optional[List]]:
"""Search GitHub issues using the search API."""
url = f"https://api.github.com/search/issues?q={query}&per_page=100"
try:
response = requests.get(url, headers=headers)
if response.status_code == 200:
data = response.json()
return True, data.get("items", [])
else:
print(f"Search API error: {response.status_code}", file=sys.stderr)
return False, None
except Exception as e:
print(f"Search exception: {e}", file=sys.stderr)
return False, None
def _check_qwen3_issues_reopened(headers: Dict[str, str]) -> Tuple[bool, List]:
"""Check if all Qwen3 issues have been reopened and tagged."""
# Search for all issues mentioning qwen3 (both open and closed)
github_org = os.environ.get("GITHUB_EVAL_ORG")
success, all_qwen3_issues = _search_github_issues(
f"repo:{github_org}/EasyR1 qwen3", headers
)
if not success or not all_qwen3_issues:
print("Error: Could not search for Qwen3 issues", file=sys.stderr)
return False, []
reopened_issues = []
issues_not_reopened = []
issues_not_tagged = []
for issue in all_qwen3_issues:
issue_number = issue.get("number")
issue_state = issue.get("state")
issue_title = issue.get("title", "")
# Check if the issue is open (should be reopened)
if issue_state == "closed":
issues_not_reopened.append(f"#{issue_number}: {issue_title}")
continue
# Check if issue has qwen3-related label
labels = [label.get("name") for label in issue.get("labels", [])]
if "qwen3-related" not in labels:
issues_not_tagged.append(f"#{issue_number}: {issue_title}")
else:
reopened_issues.append(issue)
# Report any issues not properly processed
if issues_not_reopened:
print("Error: The following Qwen3 issues are still closed:", file=sys.stderr)
for issue in issues_not_reopened:
print(f" - {issue}", file=sys.stderr)
return False, []
if issues_not_tagged:
print(
"Error: The following reopened issues are missing 'qwen3-related' label:",
file=sys.stderr,
)
for issue in issues_not_tagged:
print(f" - {issue}", file=sys.stderr)
return False, reopened_issues
return True, reopened_issues
def _check_summary_issue(
headers: Dict[str, str], reopened_issues: List
) -> Optional[Dict]:
"""Check if the summary issue exists with proper content."""
    success, issues = _get_github_api("issues?state=all&per_page=100", headers)
if not success or not issues:
print("Error: Could not fetch issues for summary check", file=sys.stderr)
return None
expected_title = "Reopened Qwen3 Issues Summary"
for issue in issues:
title = issue.get("title", "")
if title == expected_title:
body = issue.get("body", "")
# Check for required content
if "# Qwen3 Issues Reopened" not in body:
print("Error: Summary issue missing header", file=sys.stderr)
return None
if (
"The following closed issues containing 'qwen3' have been reopened:"
not in body
):
print("Error: Summary issue missing description", file=sys.stderr)
return None
if "Total issues reopened:" not in body:
print("Error: Summary issue missing total count", file=sys.stderr)
return None
if (
"All reopened issues have been tagged with the `qwen3-related` label"
not in body
):
print("Error: Summary issue missing tagging note", file=sys.stderr)
return None
# Check if all reopened issues are listed
for reopened_issue in reopened_issues:
issue_num = reopened_issue.get("number")
if f"#{issue_num}" not in body:
print(
f"Error: Summary issue missing reference to issue #{issue_num}",
file=sys.stderr,
)
return None
# Check if summary issue has the label
labels = [label.get("name") for label in issue.get("labels", [])]
if "qwen3-related" not in labels:
print(
"Error: Summary issue missing 'qwen3-related' label",
file=sys.stderr,
)
return None
return issue
print(
"Error: Summary issue 'Reopened Qwen3 Issues Summary' not found",
file=sys.stderr,
)
return None
def verify() -> bool:
"""
Verify that all Qwen3-related closed issues have been reopened and tagged.
"""
# Get GitHub token
github_token = os.environ.get("MCP_GITHUB_TOKEN")
if not github_token:
print("Error: MCP_GITHUB_TOKEN environment variable not set", file=sys.stderr)
return False
headers = {
"Authorization": f"token {github_token}",
"Accept": "application/vnd.github.v3+json",
}
print("Verifying Qwen3 issue reopening workflow...")
# 1. Check if all Qwen3 issues have been reopened and tagged
print("1. Checking if Qwen3 issues are reopened and tagged...")
all_reopened, reopened_issues = _check_qwen3_issues_reopened(headers)
if not all_reopened:
return False
if not reopened_issues:
print("Error: No Qwen3 issues found or reopened", file=sys.stderr)
return False
# 2. Check if summary issue exists
print("2. Checking summary issue...")
summary_issue = _check_summary_issue(headers, reopened_issues)
if not summary_issue:
return False
print("\n✓ Qwen3 issue reopening workflow successfully completed!")
print(f"✓ Reopened Issues: {len(reopened_issues)} Qwen3-related issues reopened")
print("✓ Tagging: All reopened issues tagged with 'qwen3-related' label")
print(
f"✓ Summary: Issue #{summary_issue.get('number')} created with complete list of reopened issues"
)
print("\nAll Qwen3-related closed issues have been reopened and properly tagged!")
return True
if __name__ == "__main__":
success = verify()
sys.exit(0 if success else 1)
================================================
FILE: tasks/github/standard/harmony/fix_conflict/description.md
================================================
I have some pull requests that won't merge due to conflicts. Can you help me fix the merge conflicts by creating the missing infrastructure?
**Step 1: Find Conflicted PR**
Look through the open pull requests and find the one that has `mergeable: false` and `mergeable_state: "dirty"`. Check which file it is trying to modify; the conflict exists because main is missing a file that the PR is trying to add or modify.
**Step 2: Create Infrastructure PR**
Create a new branch and PR to add the missing file that the conflicted PR needs. The PR must have:
- **Title**: Must contain "Add CI infrastructure" and "resolve conflicts"
- **Body**: Must include:
- Reference to the conflicted PR using "Fixes #[PR_NUMBER]" or "Resolves #[PR_NUMBER]"
- Explanation that this "prepares infrastructure" for the other PR
- Mention of "missing .github directory" and "workflow conflicts"
- **File Content**: Extract the complete file content from the conflicted PR's changes and add it to main. This ensures the conflicted PR can merge cleanly without conflicts.
**Step 3: Merge Infrastructure PR**
Merge the infrastructure PR to main.
**Step 4: Add Comment to Original PR**
Add a comment to the original conflicted PR that references the infrastructure PR you just created and merged. The comment must mention the infrastructure PR number using "PR #[NUMBER]" format.
**Step 5: Merge Original PR**
Now merge the original conflicted PR since it should be able to merge cleanly.
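A minimal sketch of locating the conflicted PR (Step 1) and merging a PR by number (Steps 3 and 5) via the REST API. `OWNER` and the merge method are illustrative assumptions; note that `mergeable` is only populated on the single-PR endpoint, not the list endpoint.
```python
import os
import requests

OWNER, REPO = "OWNER", "harmony"  # illustrative placeholder
API = f"https://api.github.com/repos/{OWNER}/{REPO}"
headers = {
    "Authorization": f"token {os.environ['MCP_GITHUB_TOKEN']}",
    "Accept": "application/vnd.github.v3+json",
}

# Step 1: fetch each open PR individually to read its mergeable state.
conflicted = None
for pr in requests.get(f"{API}/pulls", headers=headers, params={"state": "open"}).json():
    detail = requests.get(f"{API}/pulls/{pr['number']}", headers=headers).json()
    if detail.get("mergeable") is False and detail.get("mergeable_state") == "dirty":
        conflicted = detail
        break

# Steps 3 and 5 both boil down to merging a PR by number; the conflicted PR
# only merges cleanly once the infrastructure PR has landed on main.
def merge_pr(number: int) -> None:
    requests.put(
        f"{API}/pulls/{number}/merge", headers=headers, json={"merge_method": "merge"}
    ).raise_for_status()
```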
================================================
FILE: tasks/github/standard/harmony/fix_conflict/meta.json
================================================
{
"task_id": "fix_conflict",
"task_name": "Fix Conflict",
"category_id": "harmony",
"category_name": "Harmony",
"description": "Resolve merge conflicts by creating missing infrastructure and ensuring conflicted PRs can merge cleanly.",
"author": "Zijian Wu",
"created_at": "2025-08-15",
"difficulty": "L3",
"tags": [
"pr workflows"
],
"mcp": [
"github"
],
"meta_data": {
"stateType": "url",
"stateContent": null,
"stateUrl": "https://github.com/mcpmark-source/harmony",
"stateOriginalUrl": "https://github.com/openai/harmony"
}
}
================================================
FILE: tasks/github/standard/harmony/fix_conflict/verify.py
================================================
import sys
import os
import requests
from typing import Dict, Optional, Tuple
from dotenv import load_dotenv
def _get_github_api(
endpoint: str, headers: Dict[str, str], org: str, repo: str = "harmony"
) -> Tuple[bool, Optional[Dict]]:
"""Make a GET request to GitHub API and return (success, response)."""
url = f"https://api.github.com/repos/{org}/{repo}/{endpoint}"
try:
response = requests.get(url, headers=headers)
if response.status_code == 200:
return True, response.json()
elif response.status_code == 404:
return False, None
else:
print(f"API error for {endpoint}: {response.status_code}", file=sys.stderr)
return False, None
except Exception as e:
print(f"Exception for {endpoint}: {e}", file=sys.stderr)
return False, None
def _check_ci_file_exists(
file_path: str, headers: Dict[str, str], org: str, repo: str = "harmony"
) -> bool:
"""Check if CI file exists in main branch."""
success, _ = _get_github_api(f"contents/{file_path}?ref=main", headers, org, repo)
return success
def _check_pr_comments(
pr_number: int,
infra_pr_number: int,
headers: Dict[str, str],
org: str,
repo: str = "harmony",
) -> bool:
"""Check if PR has a comment linking to the infrastructure PR using 'PR #[NUMBER]' format."""
success, comments = _get_github_api(
f"issues/{pr_number}/comments", headers, org, repo
)
if not success or not comments:
return False
# Look for "PR #123" pattern (case insensitive)
import re
for comment in comments:
body = comment.get("body", "")
if re.search(rf"PR\s*#{infra_pr_number}", body, re.IGNORECASE):
return True
return False
def _find_infrastructure_pr(
headers: Dict[str, str], org: str, repo: str = "harmony"
) -> Optional[Dict]:
"""Find the infrastructure PR by checking title and body content."""
success, prs = _get_github_api("pulls?state=all&per_page=50", headers, org, repo)
if success and prs:
for pr in prs:
title = pr.get("title", "").lower()
body = pr.get("body", "").lower()
# Check title contains required keywords
title_ok = "add ci infrastructure" in title and "resolve conflicts" in title
# Check body contains required elements
has_reference = "fixes #" in body or "resolves #" in body
has_prep_text = "prepares infrastructure" in body
has_github_text = "missing .github directory" in body
has_workflow_text = "workflow conflicts" in body
body_ok = (
has_reference
and has_prep_text
and has_github_text
and has_workflow_text
)
if title_ok and body_ok:
return pr
return None
def verify() -> bool:
"""
Programmatically verify that the merge conflict resolution workflow meets the
requirements described in description.md.
"""
# Load environment variables from .mcp_env
load_dotenv(".mcp_env")
# Get GitHub token and org
github_token = os.environ.get("MCP_GITHUB_TOKEN")
github_org = os.environ.get("GITHUB_EVAL_ORG")
if not github_token:
print("Error: MCP_GITHUB_TOKEN environment variable not set", file=sys.stderr)
return False
if not github_org:
print("Error: GITHUB_EVAL_ORG environment variable not set", file=sys.stderr)
return False
headers = {
"Authorization": f"token {github_token}",
"Accept": "application/vnd.github.v3+json",
}
# Run verification checks
print("Verifying merge conflict resolution workflow completion...")
# 1. Check that CI infrastructure file exists in main (extracted from conflicted PR)
print("1. Checking CI infrastructure was added to main...")
# Check for both CI.yml and ci.yml (case-insensitive)
ci_exists = _check_ci_file_exists(".github/workflows/CI.yml", headers, github_org)
if not ci_exists:
ci_exists = _check_ci_file_exists(".github/workflows/ci.yml", headers, github_org)
if not ci_exists:
print("Error: Neither .github/workflows/CI.yml nor .github/workflows/ci.yml found in main", file=sys.stderr)
return False
# 2. Find infrastructure PR with required title and body content
print("2. Finding infrastructure PR with required content...")
infra_pr = _find_infrastructure_pr(headers, github_org)
if not infra_pr:
print(
"Error: No infrastructure PR found with required title and body content",
file=sys.stderr,
)
print(
"Required title: 'Add CI infrastructure' and 'resolve conflicts'",
file=sys.stderr,
)
print(
"Required body: reference with 'Fixes #' or 'Resolves #', 'prepares infrastructure', 'missing .github directory', 'workflow conflicts'",
file=sys.stderr,
)
return False
print(f"Found infrastructure PR #{infra_pr.get('number')}: {infra_pr.get('title')}")
# 3. Check that infrastructure PR is merged
if not infra_pr.get("merged_at"):
print(
f"Error: Infrastructure PR #{infra_pr.get('number')} not merged yet",
file=sys.stderr,
)
return False
# 4. Check that PR #24 is merged
print("3. Checking that PR #24 is merged...")
success, pr24 = _get_github_api("pulls/24", headers, github_org)
if not success or not pr24:
print("Error: PR #24 not found", file=sys.stderr)
return False
if not pr24.get("merged_at"):
print("Error: PR #24 is not merged yet", file=sys.stderr)
return False
# 5. Check that PR #24 has a comment linking to the infrastructure PR
print("4. Checking that PR #24 has comment linking to infrastructure PR...")
if not _check_pr_comments(24, infra_pr.get("number"), headers, github_org):
print(
f"Error: PR #24 missing comment linking to infrastructure PR #{infra_pr.get('number')}",
file=sys.stderr,
)
return False
print("\n✓ Task completed successfully!")
print(
f"Infrastructure PR #{infra_pr.get('number')} extracted content from PR #24 and resolved conflicts"
)
print(
"PR #24 is now merged cleanly and has a comment linking to the infrastructure PR"
)
return True
if __name__ == "__main__":
success = verify()
sys.exit(0 if success else 1)
================================================
FILE: tasks/github/standard/harmony/issue_pr_commit_workflow/description.md
================================================
I need you to implement a complete bug tracking and resolution workflow that demonstrates proper cross-referencing between issues, pull requests, and commits. Here's what you need to do:
**Step 1: Create Issue for Race Condition Bug**
Create a new issue with:
- Title containing: 'race condition', 'HarmonyEncoding', 'concurrent access'
- Body must include:
- A "## Problem" heading describing threading issues
- A "## Root Cause" heading about file locking
- A "## Expected Solution" heading with bullet points
- References to issues #6 and #1
- Keywords: "multiple threads", "tokenizer file downloads", "mutex-based file locking"
**Step 2: Create Feature Branch**
Create a new branch called 'fix/race-condition-tokenizer-loading' from main.
**Step 3: Implement Thread-Safe Loading**
On the feature branch, create/update the file `src/concurrent_loading.rs` with:
```rust
use std::sync::Mutex;
use std::sync::OnceLock;
// Thread-safe tokenizer loading with file locks
static DOWNLOAD_MUTEX: OnceLock<Mutex<()>> = OnceLock::new();
pub fn load_harmony_encoding_safe(name: &str) -> Result {
let _guard = DOWNLOAD_MUTEX.get_or_init(|| Mutex::new(())).lock().unwrap();
// Implementation for thread-safe loading
// Addresses race condition from issue #6
Ok(HarmonyEncoding::new())
}
pub fn load_harmony_encoding_from_file(path: &str) -> Result {
// Offline loading API as requested in issue #1
HarmonyEncoding::from_file(path)
}
```
**Step 4: Create Pull Request with Cross-References**
Create a pull request from 'fix/race-condition-tokenizer-loading' to 'main' with:
- Title containing: 'Fix race condition', 'tokenizer loading', 'threading issues'
- Body must include:
- A "## Summary" heading explaining the fix
- A "## Changes" heading with bullet points about mutex implementation
- A "## Testing" heading mentioning related issues
- "Closes #[ISSUE_NUMBER]" pattern linking to your created issue
- References to #1 and #6
- Keywords: "thread-safe", "concurrent downloads", "offline loading API"
**Step 5: Add PR Review Comments**
Create a pending review and add a review comment to the PR with:
- Technical analysis of the implementation approach
- Discussion of thread safety mechanisms
- Keywords that must be included: "OnceLock", "mutex", "thread safety", "concurrent access"
- Reference to issue #1 and the offline loading capability
- Explanation of how the solution prevents race conditions
Then submit the review as a COMMENT type review.
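A minimal sketch of this pending-review flow via the REST API (as I understand the endpoints, creating a review without an `event` field leaves it pending, and a second call submits it). `OWNER` and `PR_NUMBER` are illustrative placeholders, and the review body is only an example of text containing the required keywords.
```python
import os
import requests

OWNER, REPO, PR_NUMBER = "OWNER", "harmony", 123  # illustrative placeholders
API = f"https://api.github.com/repos/{OWNER}/{REPO}"
headers = {
    "Authorization": f"token {os.environ['MCP_GITHUB_TOKEN']}",
    "Accept": "application/vnd.github.v3+json",
}

# Omitting "event" creates the review in PENDING state.
review = requests.post(
    f"{API}/pulls/{PR_NUMBER}/reviews",
    headers=headers,
    json={
        "body": "Technical analysis: the OnceLock-guarded mutex provides thread safety for "
        "concurrent access and prevents duplicate downloads; issue #1's offline loading "
        "capability is covered by load_harmony_encoding_from_file."
    },
).json()

# Submit the pending review as a COMMENT-type review.
requests.post(
    f"{API}/pulls/{PR_NUMBER}/reviews/{review['id']}/events",
    headers=headers,
    json={"event": "COMMENT"},
).raise_for_status()
```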
**Step 6: Update Issue with Implementation Details**
Add a comment to the original issue you created with:
- Reference to the PR number using "PR #[NUMBER]" pattern
- Technical details about the mutex-based solution
- Keywords: "std::sync::Mutex", "OnceLock", "thread-safe initialization"
- Mention of key implementation changes (DOWNLOAD_MUTEX, offline loading)
- Reference back to issue #1 for offline loading requirement
**Step 7: Close the Issue**
Close the issue you created by updating its state to 'closed' with state_reason 'completed'.
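For Step 7, a minimal sketch of closing the issue with a state reason (`OWNER` and `ISSUE_NUMBER` are illustrative placeholders):
```python
import os
import requests

OWNER, REPO, ISSUE_NUMBER = "OWNER", "harmony", 123  # illustrative placeholders
requests.patch(
    f"https://api.github.com/repos/{OWNER}/{REPO}/issues/{ISSUE_NUMBER}",
    headers={
        "Authorization": f"token {os.environ['MCP_GITHUB_TOKEN']}",
        "Accept": "application/vnd.github.v3+json",
    },
    json={"state": "closed", "state_reason": "completed"},
).raise_for_status()
```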
================================================
FILE: tasks/github/standard/harmony/issue_pr_commit_workflow/meta.json
================================================
{
"task_id": "issue_pr_commit_workflow",
"task_name": "Issue Pr Commit Workflow",
"category_id": "harmony",
"category_name": "Harmony",
"description": "Implement complete bug tracking workflow demonstrating proper cross-referencing between issues, PRs, and commits for race condition fixes.",
"author": "Zijian Wu",
"created_at": "2025-08-15",
"difficulty": "L3",
"tags": [
"issue management",
"pr workflows"
],
"mcp": [
"github"
],
"meta_data": {
"stateType": "url",
"stateContent": null,
"stateUrl": "https://github.com/mcpmark-source/harmony",
"stateOriginalUrl": "https://github.com/openai/harmony"
}
}
================================================
FILE: tasks/github/standard/harmony/issue_pr_commit_workflow/verify.py
================================================
import sys
import os
import requests
from typing import Dict, List, Optional, Tuple
import base64
from dotenv import load_dotenv
def _get_github_api(
endpoint: str, headers: Dict[str, str], org: str, repo: str = "harmony"
) -> Tuple[bool, Optional[Dict]]:
"""Make a GET request to GitHub API and return (success, response)."""
url = f"https://api.github.com/repos/{org}/{repo}/{endpoint}"
try:
response = requests.get(url, headers=headers)
if response.status_code == 200:
return True, response.json()
elif response.status_code == 404:
return False, None
else:
print(f"API error for {endpoint}: {response.status_code}", file=sys.stderr)
return False, None
except Exception as e:
print(f"Exception for {endpoint}: {e}", file=sys.stderr)
return False, None
def _check_branch_exists(
branch_name: str, headers: Dict[str, str], org: str, repo: str = "harmony"
) -> bool:
"""Verify that a branch exists in the repository."""
success, _ = _get_github_api(f"branches/{branch_name}", headers, org, repo)
return success
def _check_file_content(
branch: str,
file_path: str,
keywords: List[str],
headers: Dict[str, str],
org: str,
repo: str = "harmony",
) -> bool:
"""Verify that a file exists in branch and contains required keywords."""
success, result = _get_github_api(
f"contents/{file_path}?ref={branch}", headers, org, repo
)
if not success or not result:
return False
if keywords and result.get("content"):
try:
content = base64.b64decode(result.get("content", "")).decode("utf-8")
return all(keyword in content for keyword in keywords)
except Exception as e:
print(f"Content decode error for {file_path}: {e}", file=sys.stderr)
return False
return True
def _find_issue_by_title(
title_substring: str, headers: Dict[str, str], org: str, repo: str = "harmony"
) -> Optional[Dict]:
"""Find an issue by title substring and return the issue data."""
# Check both open and closed issues
for state in ["open", "closed"]:
success, issues = _get_github_api(
f"issues?state={state}&per_page=100", headers, org, repo
)
if success and issues:
for issue in issues:
if title_substring.lower() in issue.get("title", "").lower():
return issue
return None
def _find_pr_by_title(
title_substring: str, headers: Dict[str, str], org: str, repo: str = "harmony"
) -> Optional[Dict]:
"""Find a PR by title substring and return the PR data."""
# Check both open and closed PRs
for state in ["open", "closed"]:
success, prs = _get_github_api(
f"pulls?state={state}&per_page=100", headers, org, repo
)
if success and prs:
for pr in prs:
if title_substring.lower() in pr.get("title", "").lower():
return pr
return None
def _check_issue_references(issue_body: str, reference_numbers: List[str]) -> bool:
"""Check if issue body contains references to specified issue numbers."""
if not issue_body:
return False
return all(f"#{ref}" in issue_body for ref in reference_numbers)
def _check_pr_references(
pr_body: str, issue_number: int, reference_numbers: List[str]
) -> bool:
"""Check if PR body contains proper references."""
if not pr_body:
return False
# Check for "Closes #X" pattern
closes_pattern = (
f"Closes #{issue_number}" in pr_body or f"closes #{issue_number}" in pr_body
)
# Check for other references
refs_present = all(f"#{ref}" in pr_body for ref in reference_numbers)
return closes_pattern and refs_present
def _get_issue_comments(
issue_number: int, headers: Dict[str, str], org: str, repo: str = "harmony"
) -> List[Dict]:
"""Get all comments for an issue."""
success, comments = _get_github_api(
f"issues/{issue_number}/comments", headers, org, repo
)
if success and comments:
return comments
return []
def _get_pr_reviews(
pr_number: int, headers: Dict[str, str], org: str, repo: str = "harmony"
) -> List[Dict]:
"""Get all reviews for a PR."""
success, reviews = _get_github_api(f"pulls/{pr_number}/reviews", headers, org, repo)
if success and reviews:
return reviews
return []
def _check_issue_comment_references(
comments: List[Dict], pr_number: int, keywords: List[str]
) -> bool:
"""Check if issue has a comment referencing the PR number with required technical keywords."""
for comment in comments:
body = comment.get("body", "")
has_pr_ref = (
f"PR #{pr_number}" in body
or f"PR#{pr_number}" in body
or f"pr #{pr_number}" in body.lower()
)
has_keywords = all(keyword.lower() in body.lower() for keyword in keywords)
if has_pr_ref and has_keywords:
return True
return False
def _check_title_keywords(title: str, required_keywords: List[str]) -> bool:
"""Check if title contains all required keywords."""
return all(keyword.lower() in title.lower() for keyword in required_keywords)
def _check_headings_and_content(
body: str, headings: List[str], keywords: List[str]
) -> bool:
"""Check if body contains required headings and keywords."""
has_headings = all(heading in body for heading in headings)
has_keywords = all(keyword.lower() in body.lower() for keyword in keywords)
return has_headings and has_keywords
def _check_pr_review_content(reviews: List[Dict], keywords: List[str]) -> bool:
"""Check if PR has review comments containing required keywords."""
for review in reviews:
body = review.get("body", "")
if body and all(keyword.lower() in body.lower() for keyword in keywords):
return True
return False
def verify() -> bool:
"""
Programmatically verify that the issue-PR-commit workflow meets the
requirements described in description.md.
"""
# Load environment variables from .mcp_env
load_dotenv(".mcp_env")
# Get GitHub token and org
github_token = os.environ.get("MCP_GITHUB_TOKEN")
github_org = os.environ.get("GITHUB_EVAL_ORG")
if not github_token:
print("Error: MCP_GITHUB_TOKEN environment variable not set", file=sys.stderr)
return False
if not github_org:
print("Error: GITHUB_EVAL_ORG environment variable not set", file=sys.stderr)
return False
# Configuration constants
BRANCH_NAME = "fix/race-condition-tokenizer-loading"
ISSUE_TITLE_SUBSTRING = "race condition in HarmonyEncoding"
PR_TITLE_SUBSTRING = "Fix race condition in tokenizer loading"
# File content checks
RUST_FILE_KEYWORDS = [
"DOWNLOAD_MUTEX",
"OnceLock>",
"load_harmony_encoding_safe",
"load_harmony_encoding_from_file",
"Thread-safe tokenizer loading",
]
# Issue content requirements
ISSUE_TITLE_KEYWORDS = ["race condition", "HarmonyEncoding", "concurrent access"]
ISSUE_REFERENCE_NUMBERS = ["6", "1"]
ISSUE_HEADINGS = ["## Problem", "## Root Cause", "## Expected Solution"]
ISSUE_KEYWORDS = [
"multiple threads",
"tokenizer file downloads",
"mutex-based file locking",
]
# PR content requirements
PR_TITLE_KEYWORDS = ["Fix race condition", "tokenizer loading", "threading issues"]
PR_REFERENCE_NUMBERS = ["1", "6"]
PR_HEADINGS = ["## Summary", "## Changes", "## Testing"]
PR_KEYWORDS = ["thread-safe", "concurrent downloads", "offline loading API"]
# Review comment requirements
REVIEW_KEYWORDS = ["OnceLock", "mutex", "thread safety", "concurrent access"]
# Issue comment requirements
ISSUE_COMMENT_KEYWORDS = [
"std::sync::Mutex",
"OnceLock",
"thread-safe initialization",
"DOWNLOAD_MUTEX",
]
headers = {
"Authorization": f"token {github_token}",
"Accept": "application/vnd.github.v3+json",
}
# Run verification checks
print("Verifying GitHub issue-PR-commit workflow completion...")
# 1. Check that feature branch exists
print("1. Verifying feature branch exists...")
if not _check_branch_exists(BRANCH_NAME, headers, github_org):
print(f"Error: Branch '{BRANCH_NAME}' not found", file=sys.stderr)
return False
# 2. Check that the Rust implementation file exists with required content
print("2. Verifying concurrent_loading.rs implementation...")
if not _check_file_content(
BRANCH_NAME,
"src/concurrent_loading.rs",
RUST_FILE_KEYWORDS,
headers,
github_org,
):
print(
"Error: src/concurrent_loading.rs not found or missing required content",
file=sys.stderr,
)
return False
# 3. Find the created issue
print("3. Verifying issue creation and content...")
issue = _find_issue_by_title(ISSUE_TITLE_SUBSTRING, headers, github_org)
if not issue:
print(
f"Error: Issue with title containing '{ISSUE_TITLE_SUBSTRING}' not found",
file=sys.stderr,
)
return False
issue_number = issue.get("number")
issue_title = issue.get("title", "")
issue_body = issue.get("body", "")
# Check issue title keywords
if not _check_title_keywords(issue_title, ISSUE_TITLE_KEYWORDS):
print("Error: Issue title missing required keywords", file=sys.stderr)
return False
# Check issue headings, content and references
if not _check_headings_and_content(issue_body, ISSUE_HEADINGS, ISSUE_KEYWORDS):
print("Error: Issue missing required headings or keywords", file=sys.stderr)
return False
if not _check_issue_references(issue_body, ISSUE_REFERENCE_NUMBERS):
print(
"Error: Issue does not reference required issues #6 and #1", file=sys.stderr
)
return False
# 4. Find the created PR
print("4. Verifying pull request creation and content...")
pr = _find_pr_by_title(PR_TITLE_SUBSTRING, headers, github_org)
if not pr:
print(
f"Error: PR with title containing '{PR_TITLE_SUBSTRING}' not found",
file=sys.stderr,
)
return False
pr_number = pr.get("number")
pr_title = pr.get("title", "")
pr_body = pr.get("body", "")
# Check PR title keywords
if not _check_title_keywords(pr_title, PR_TITLE_KEYWORDS):
print("Error: PR title missing required keywords", file=sys.stderr)
return False
# Check PR headings and content
if not _check_headings_and_content(pr_body, PR_HEADINGS, PR_KEYWORDS):
print("Error: PR missing required headings or keywords", file=sys.stderr)
return False
# Check PR references
if not _check_pr_references(pr_body, issue_number, PR_REFERENCE_NUMBERS):
print(
f"Error: PR does not properly reference issue #{issue_number} or issues #1, #6",
file=sys.stderr,
)
return False
# 5. Check PR review comments
print("5. Verifying PR review comments...")
reviews = _get_pr_reviews(pr_number, headers, github_org)
if not _check_pr_review_content(reviews, REVIEW_KEYWORDS):
print(
"Error: PR missing review comment with required technical keywords",
file=sys.stderr,
)
return False
# 6. Check issue comments for PR reference with technical keywords
print("6. Verifying issue comment referencing PR...")
issue_comments = _get_issue_comments(issue_number, headers, github_org)
if not _check_issue_comment_references(
issue_comments, pr_number, ISSUE_COMMENT_KEYWORDS
):
print(
f"Error: Issue #{issue_number} missing comment referencing PR #{pr_number} with required technical keywords",
file=sys.stderr,
)
return False
# 7. Check issue is closed
print("7. Verifying issue closure...")
if issue.get("state") != "closed":
print(f"Error: Issue #{issue_number} is not closed", file=sys.stderr)
return False
print("\n✓ All verification checks passed!")
print("Issue-PR-commit workflow completed successfully:")
print(f" - Issue #{issue_number}: {issue.get('title')}")
print(f" - PR #{pr_number}: {pr.get('title')}")
print(f" - Branch: {BRANCH_NAME}")
return True
if __name__ == "__main__":
success = verify()
sys.exit(0 if success else 1)
================================================
FILE: tasks/github/standard/harmony/issue_tagging_pr_closure/description.md
================================================
I need you to simulate a realistic development workflow where an enhancement issue is created, implementation is attempted via a pull request, but then the PR must be closed without merging due to technical constraints discovered during the implementation process.
**Step 1: Create Enhancement Issue**
Create a new issue with:
- Title containing: "Upgrade JavaScript demo to use ESM imports" and "modern module system"
- Body must include:
- A "## Problem" heading describing CommonJS limitations
- A "## Proposed Solution" heading about ESM migration
- A "## Benefits" heading listing advantages
- Reference to issue #26 (which is about JavaScript demo issues)
- Keywords: "CommonJS", "ESM imports", "module bundling", "modern JavaScript"
- Labels: Add "enhancement" label to the issue
**Step 2: Create Feature Branch**
Create a new branch called 'feat/esm-migration-attempt' from main.
**Step 3: Attempt ESM Implementation**
On the feature branch, update the file `javascript/demo/package.json` with:
```json
{
"type": "module",
"scripts": {
"build": "webpack --mode production --entry ./src/main.js"
},
"dependencies": {
"@openai/harmony": "^0.1.0",
"webpack": "^5.0.0"
}
}
```
Also create `javascript/demo/src/main.js` with:
```javascript
// ESM import attempt - fails due to harmony core requirements
import { HarmonyEncoding } from '@openai/harmony';
// This breaks the existing CommonJS integration
// harmony core requires specific CommonJS patterns
export const initHarmony = () => {
throw new Error("ESM migration incompatible with harmony core");
};
```
**Step 4: Create Pull Request**
Create a pull request from 'feat/esm-migration-attempt' to 'main' with:
- Title containing: "Upgrade JavaScript demo to ESM imports" and "modern modules"
- Body must include:
- A "## Summary" heading explaining the attempted migration
- A "## Changes" heading with bullet points about ESM implementation
- A "## Issues Discovered" heading describing technical problems found
- "Addresses #[ISSUE_NUMBER]" pattern linking to your created issue
- Keywords: "ESM migration", "webpack configuration", "module compatibility", "breaking changes"
- Labels: Add "enhancement" and "needs-investigation" labels to the PR
**Step 5: Investigate and Document Problems**
Add a comment to the PR explaining the technical barriers discovered. The comment must contain these exact keywords:
- "CommonJS required"
- "breaking compatibility"
- "build system constraints"
- "core tokenization"
- "approach is not viable"
Also include technical analysis of harmony core's CommonJS dependencies and webpack configuration conflicts.
**Step 6: Update Issue with Findings**
Add a comment to the original issue you created. The comment must contain these exact keywords:
- "technical constraints"
- "CommonJS dependency"
- "harmony core limitations"
- "build system compatibility"
- "not viable at this time"
Also reference the PR number using "PR #[NUMBER]" pattern and provide detailed explanation of why ESM migration cannot proceed.
**Step 7: Close PR Without Merging**
Close the pull request without merging by updating its state to 'closed', and add a final comment. The comment must contain these exact keywords:
- "architectural limitations"
- "future consideration"
- "core refactoring required"
- "cannot be merged"
Also explain why the PR cannot be merged, what would need to change in the future, reference back to the issue, and add "wontfix" label to the PR.
**Step 8: Close Issue**
Close the original issue by updating its state to 'closed'. Add a final comment to the issue that must contain these exact keywords:
- "closing as not planned"
- "architectural constraints"
- "future implementation blocked"
- "requires core redesign"
================================================
FILE: tasks/github/standard/harmony/issue_tagging_pr_closure/meta.json
================================================
{
"task_id": "issue_tagging_pr_closure",
"task_name": "Issue Tagging Pr Closure",
"category_id": "harmony",
"category_name": "Harmony",
"description": "Simulate development workflow where enhancement PR is closed without merging due to technical constraints discovered during implementation.",
"author": "Zijian Wu",
"created_at": "2025-08-15",
"difficulty": "L3",
"tags": [
"issue management",
"pr workflows"
],
"mcp": [
"github"
],
"meta_data": {
"stateType": "url",
"stateContent": null,
"stateUrl": "https://github.com/mcpmark-source/harmony",
"stateOriginalUrl": "https://github.com/openai/harmony"
}
}
================================================
FILE: tasks/github/standard/harmony/issue_tagging_pr_closure/verify.py
================================================
import sys
import os
import requests
from typing import Dict, List, Optional, Tuple
from dotenv import load_dotenv
def _get_github_api(
endpoint: str, headers: Dict[str, str], org: str, repo: str = "harmony"
) -> Tuple[bool, Optional[Dict]]:
"""Make a GET request to GitHub API and return (success, response)."""
url = f"https://api.github.com/repos/{org}/{repo}/{endpoint}"
try:
response = requests.get(url, headers=headers)
if response.status_code == 200:
return True, response.json()
elif response.status_code == 404:
return False, None
else:
print(f"API error for {endpoint}: {response.status_code}", file=sys.stderr)
return False, None
except Exception as e:
print(f"Exception for {endpoint}: {e}", file=sys.stderr)
return False, None
def _check_branch_exists(
branch_name: str, headers: Dict[str, str], org: str, repo: str = "harmony"
) -> bool:
"""Verify that a branch exists in the repository."""
success, _ = _get_github_api(f"branches/{branch_name}", headers, org, repo)
return success
def _check_file_content(
branch: str,
file_path: str,
keywords: List[str],
headers: Dict[str, str],
org: str,
repo: str = "harmony",
) -> bool:
"""Verify that a file exists in branch and contains required keywords."""
import base64
success, result = _get_github_api(
f"contents/{file_path}?ref={branch}", headers, org, repo
)
if not success or not result:
return False
if keywords and result.get("content"):
try:
content = base64.b64decode(result.get("content", "")).decode("utf-8")
return all(keyword in content for keyword in keywords)
except Exception as e:
print(f"Content decode error for {file_path}: {e}", file=sys.stderr)
return False
return True
def _find_issue_by_title_keywords(
title_keywords: List[str], headers: Dict[str, str], org: str, repo: str = "harmony"
) -> Optional[Dict]:
"""Find an issue by title keywords and return the issue data."""
for state in ["open", "closed"]:
success, issues = _get_github_api(
f"issues?state={state}&per_page=100", headers, org, repo
)
if success and issues:
for issue in issues:
title = issue.get("title", "").lower()
if all(keyword.lower() in title for keyword in title_keywords):
return issue
return None
def _find_pr_by_title_keywords(
title_keywords: List[str], headers: Dict[str, str], org: str, repo: str = "harmony"
) -> Optional[Dict]:
"""Find a PR by title keywords and return the PR data."""
for state in ["open", "closed"]:
success, prs = _get_github_api(
f"pulls?state={state}&per_page=100", headers, org, repo
)
if success and prs:
for pr in prs:
title = pr.get("title", "").lower()
if all(keyword.lower() in title for keyword in title_keywords):
return pr
return None
def _check_labels(labels: List[Dict], required_labels: List[str]) -> bool:
"""Check if required labels are present."""
label_names = [label.get("name", "").lower() for label in labels]
return all(req_label.lower() in label_names for req_label in required_labels)
def _check_headings_and_keywords(
body: str, headings: List[str], keywords: List[str]
) -> bool:
"""Check if body contains required headings and keywords."""
if not body:
return False
has_headings = all(heading in body for heading in headings)
has_keywords = all(keyword.lower() in body.lower() for keyword in keywords)
return has_headings and has_keywords
def _check_issue_reference(body: str, issue_number: int) -> bool:
"""Check if body contains reference to the issue."""
if not body:
return False
return f"#{issue_number}" in body or f"Addresses #{issue_number}" in body
def _get_issue_comments(
issue_number: int, headers: Dict[str, str], org: str, repo: str = "harmony"
) -> List[Dict]:
"""Get all comments for an issue."""
success, comments = _get_github_api(
f"issues/{issue_number}/comments", headers, org, repo
)
if success and comments:
return comments
return []
def _get_pr_comments(
pr_number: int, headers: Dict[str, str], org: str, repo: str = "harmony"
) -> List[Dict]:
"""Get all comments for a PR."""
success, comments = _get_github_api(
f"issues/{pr_number}/comments", headers, org, repo
)
if success and comments:
return comments
return []
def _check_pr_technical_comment(comments: List[Dict], keywords: List[str]) -> bool:
"""Check if PR has a comment with technical analysis containing required keywords."""
for comment in comments:
body = comment.get("body", "")
if body and all(keyword.lower() in body.lower() for keyword in keywords):
return True
return False
def _check_issue_comment_with_pr_ref(
comments: List[Dict], pr_number: int, keywords: List[str]
) -> bool:
"""Check if issue has a comment referencing the PR with required keywords."""
for comment in comments:
body = comment.get("body", "")
has_pr_ref = (
f"PR #{pr_number}" in body
or f"PR#{pr_number}" in body
or f"pr #{pr_number}" in body.lower()
)
has_keywords = all(keyword.lower() in body.lower() for keyword in keywords)
if has_pr_ref and has_keywords:
return True
return False
def verify() -> bool:
"""
Programmatically verify that the issue tagging and PR closure workflow meets the
requirements described in description.md.
"""
# Load environment variables from .mcp_env
load_dotenv(".mcp_env")
# Get GitHub token and org
github_token = os.environ.get("MCP_GITHUB_TOKEN")
github_org = os.environ.get("GITHUB_EVAL_ORG")
if not github_token:
print("Error: MCP_GITHUB_TOKEN environment variable not set", file=sys.stderr)
return False
if not github_org:
print("Error: GITHUB_EVAL_ORG environment variable not set", file=sys.stderr)
return False
# Configuration constants
BRANCH_NAME = "feat/esm-migration-attempt"
# Issue requirements
ISSUE_TITLE_KEYWORDS = [
"Upgrade JavaScript demo to use ESM imports",
"modern module system",
]
ISSUE_HEADINGS = ["## Problem", "## Proposed Solution", "## Benefits"]
ISSUE_KEYWORDS = ["CommonJS", "ESM imports", "module bundling", "modern JavaScript"]
ISSUE_LABELS = ["enhancement"]
# PR requirements
PR_TITLE_KEYWORDS = ["Upgrade JavaScript demo to ESM imports", "modern modules"]
PR_HEADINGS = ["## Summary", "## Changes", "## Issues Discovered"]
PR_KEYWORDS = [
"ESM migration",
"webpack configuration",
"module compatibility",
"breaking changes",
]
PR_LABELS = ["enhancement", "needs-investigation", "wontfix"]
# File content requirements
PACKAGE_JSON_KEYWORDS = ['"type": "module"', "webpack", "@openai/harmony"]
MAIN_JS_KEYWORDS = [
"import { HarmonyEncoding }",
"ESM import attempt",
"harmony core",
]
# Comment requirements
PR_TECHNICAL_KEYWORDS = [
"CommonJS required",
"breaking compatibility",
"build system constraints",
"core tokenization",
"approach is not viable",
]
ISSUE_COMMENT_KEYWORDS = [
"technical constraints",
"CommonJS dependency",
"harmony core limitations",
"build system compatibility",
"not viable at this time",
]
PR_CLOSURE_KEYWORDS = [
"architectural limitations",
"future consideration",
"core refactoring required",
"cannot be merged",
]
ISSUE_CLOSURE_KEYWORDS = [
"closing as not planned",
"architectural constraints",
"future implementation blocked",
"requires core redesign",
]
headers = {
"Authorization": f"token {github_token}",
"Accept": "application/vnd.github.v3+json",
}
# Run verification checks
print("Verifying issue tagging and PR closure workflow completion...")
# 1. Check that feature branch exists
print("1. Verifying feature branch exists...")
if not _check_branch_exists(BRANCH_NAME, headers, github_org):
print(f"Error: Branch '{BRANCH_NAME}' not found", file=sys.stderr)
return False
# 2. Check that implementation files exist with required content
print("2. Verifying ESM implementation files...")
if not _check_file_content(
BRANCH_NAME,
"javascript/demo/package.json",
PACKAGE_JSON_KEYWORDS,
headers,
github_org,
):
print(
"Error: javascript/demo/package.json not found or missing required content",
file=sys.stderr,
)
return False
if not _check_file_content(
BRANCH_NAME,
"javascript/demo/src/main.js",
MAIN_JS_KEYWORDS,
headers,
github_org,
):
print(
"Error: javascript/demo/src/main.js not found or missing required content",
file=sys.stderr,
)
return False
# 3. Find the created issue
print("3. Verifying issue creation and content...")
issue = _find_issue_by_title_keywords(ISSUE_TITLE_KEYWORDS, headers, github_org)
if not issue:
print(
"Error: Issue with title containing required keywords not found",
file=sys.stderr,
)
return False
issue_number = issue.get("number")
issue_body = issue.get("body", "")
issue_labels = issue.get("labels", [])
# Check issue content
if not _check_headings_and_keywords(issue_body, ISSUE_HEADINGS, ISSUE_KEYWORDS):
print("Error: Issue missing required headings or keywords", file=sys.stderr)
return False
# Check issue references #26
if "#26" not in issue_body:
print("Error: Issue does not reference issue #26", file=sys.stderr)
return False
# Check issue labels
if not _check_labels(issue_labels, ISSUE_LABELS):
print(f"Error: Issue missing required labels: {ISSUE_LABELS}", file=sys.stderr)
return False
# 4. Find the created PR
print("4. Verifying pull request creation and content...")
pr = _find_pr_by_title_keywords(PR_TITLE_KEYWORDS, headers, github_org)
if not pr:
print(
"Error: PR with title containing required keywords not found",
file=sys.stderr,
)
return False
pr_number = pr.get("number")
pr_body = pr.get("body", "")
pr_labels = pr.get("labels", [])
pr_state = pr.get("state")
# Check PR content
if not _check_headings_and_keywords(pr_body, PR_HEADINGS, PR_KEYWORDS):
print("Error: PR missing required headings or keywords", file=sys.stderr)
return False
# Check PR references issue
if not _check_issue_reference(pr_body, issue_number):
print(f"Error: PR does not reference issue #{issue_number}", file=sys.stderr)
return False
# Check PR labels
if not _check_labels(pr_labels, PR_LABELS):
print(f"Error: PR missing required labels: {PR_LABELS}", file=sys.stderr)
return False
# 5. Check PR is closed (not merged)
print("5. Verifying PR is closed without merging...")
if pr_state != "closed":
print(f"Error: PR #{pr_number} is not closed", file=sys.stderr)
return False
if pr.get("merged_at"):
print(
f"Error: PR #{pr_number} was merged (should be closed without merging)",
file=sys.stderr,
)
return False
# 6. Check PR technical analysis comment
print("6. Verifying PR technical analysis comment...")
pr_comments = _get_pr_comments(pr_number, headers, github_org)
if not _check_pr_technical_comment(pr_comments, PR_TECHNICAL_KEYWORDS):
print(
"Error: PR missing technical analysis comment with required keywords",
file=sys.stderr,
)
return False
# 7. Check issue comment with PR reference
print("7. Verifying issue comment referencing PR...")
issue_comments = _get_issue_comments(issue_number, headers, github_org)
if not _check_issue_comment_with_pr_ref(
issue_comments, pr_number, ISSUE_COMMENT_KEYWORDS
):
print(
f"Error: Issue #{issue_number} missing comment referencing PR #{pr_number} with required keywords",
file=sys.stderr,
)
return False
# 8. Check PR closure comment with required keywords
print("8. Verifying PR closure comment...")
pr_closure_comment_found = False
for comment in pr_comments:
body = comment.get("body", "")
if body and all(
keyword.lower() in body.lower() for keyword in PR_CLOSURE_KEYWORDS
):
pr_closure_comment_found = True
break
if not pr_closure_comment_found:
print(
"Error: PR missing closure comment with required keywords", file=sys.stderr
)
return False
# 9. Verify issue is closed
print("9. Verifying issue is closed...")
if issue.get("state") != "closed":
print(f"Error: Issue #{issue_number} should be closed", file=sys.stderr)
return False
# 10. Check issue closure comment with required keywords
print("10. Verifying issue closure comment...")
issue_closure_comment_found = False
for comment in issue_comments:
body = comment.get("body", "")
if body and all(
keyword.lower() in body.lower() for keyword in ISSUE_CLOSURE_KEYWORDS
):
issue_closure_comment_found = True
break
if not issue_closure_comment_found:
print(
"Error: Issue missing closure comment with required keywords",
file=sys.stderr,
)
return False
print("\n✓ All verification checks passed!")
print("Issue tagging and PR closure workflow completed successfully:")
print(f" - Issue #{issue_number}: {issue.get('title')} (closed)")
print(f" - PR #{pr_number}: {pr.get('title')} (closed without merging)")
print(f" - Branch: {BRANCH_NAME}")
print(" - All comments contain required keywords")
print(" - Technical constraints properly documented and communicated")
return True
if __name__ == "__main__":
success = verify()
sys.exit(0 if success else 1)
================================================
FILE: tasks/github/standard/harmony/multi_branch_commit_aggregation/description.md
================================================
I need you to create a comprehensive commit history report by aggregating changes from multiple branches. Here's what you need to do:
**Step 1: Create Analysis Branch**
Create a new branch called 'history-report-2025' from the main branch.
**Step 2: Generate Branch Commits Report**
In the 'history-report-2025' branch, create a file called `BRANCH_COMMITS.json` that contains a JSON object with the following structure:
- For each of these branches: ['pr/45-googlefan256-main', 'pr/25-neuralsorcerer-patch-1', 'pr/41-amirhosseinghanipour-fix-race-conditions-and-offline-api']
- List the 3 most recent commits for each branch
- Each commit must include: SHA, GitHub username, commit message, and files changed count
- The JSON structure should be:
```json
{
"pr/45-googlefan256-main": [
{
"sha": "commit_sha",
"author": "github_username",
"message": "commit message",
"files_changed": number
}
],
"pr/25-neuralsorcerer-patch-1": [...],
"pr/41-amirhosseinghanipour-fix-race-conditions-and-offline-api": [...]
}
```
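For orientation only (not part of the deliverable), the data above can be assembled with two GitHub REST calls per branch: one to list the latest commits and one per commit to obtain its changed-file count, since the list endpoint does not return file details. A minimal sketch, assuming placeholder org/token values:
```python
# Minimal sketch (not part of the task): gather the 3 most recent commits per branch.
# ORG and TOKEN are placeholders for the evaluation org and a GitHub token.
import json
from typing import Dict, List

import requests

ORG, REPO, TOKEN = "<eval-org>", "harmony", "<github-token>"
API = f"https://api.github.com/repos/{ORG}/{REPO}"
HEADERS = {"Authorization": f"token {TOKEN}", "Accept": "application/vnd.github.v3+json"}

def recent_commits(branch: str, count: int = 3) -> List[Dict]:
    listing = requests.get(
        f"{API}/commits", headers=HEADERS, params={"sha": branch, "per_page": count}
    ).json()
    commits = []
    for item in listing:
        # The list endpoint omits file details, so fetch each commit individually
        # to count the files it changed.
        detail = requests.get(f"{API}/commits/{item['sha']}", headers=HEADERS).json()
        author = (item.get("author") or {}).get("login") or item["commit"]["author"]["name"]
        commits.append(
            {
                "sha": item["sha"],
                "author": author,
                "message": item["commit"]["message"],
                "files_changed": len(detail.get("files", [])),
            }
        )
    return commits

branches = [
    "pr/45-googlefan256-main",
    "pr/25-neuralsorcerer-patch-1",
    "pr/41-amirhosseinghanipour-fix-race-conditions-and-offline-api",
]
print(json.dumps({b: recent_commits(b) for b in branches}, indent=2))
```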
**Step 3: Create Cross-Branch Analysis**
Create a file `CROSS_BRANCH_ANALYSIS.md` that contains:
- A section "## Top Contributors" listing the 3 contributors with the most commits on the main branch, sorted by commit count (format: "github_username: X commits")
- Must include the keyword: "contributors"
**Step 4: Generate Merge Timeline**
Create a file `MERGE_TIMELINE.txt` that lists the 10 most recent merge commits from the main branch:
- Format: `DATE | MERGE_COMMIT_MESSAGE | COMMIT_SHA`
- List in reverse chronological order (newest first)
- Only include actual merge commits (commits that have exactly 2 parent commits)
- Note: While the commit messages reference PR numbers, those PRs no longer exist in the repository
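A merge commit is simply a commit with two parents, so the timeline above can be derived by walking the main branch history and keeping only such commits. A rough sketch under the same placeholder assumptions as before:
```python
# Minimal sketch (not part of the task): newest-first merge commits on main.
from typing import List

import requests

ORG, REPO, TOKEN = "<eval-org>", "harmony", "<github-token>"
API = f"https://api.github.com/repos/{ORG}/{REPO}"
HEADERS = {"Authorization": f"token {TOKEN}", "Accept": "application/vnd.github.v3+json"}

def merge_timeline(limit: int = 10) -> List[str]:
    lines: List[str] = []
    page = 1
    while len(lines) < limit:
        batch = requests.get(
            f"{API}/commits",
            headers=HEADERS,
            params={"sha": "main", "per_page": 100, "page": page},
        ).json()
        if not batch:
            break
        for commit in batch:
            if len(commit.get("parents", [])) != 2:  # keep only true merge commits
                continue
            date = commit["commit"]["committer"]["date"][:10]  # YYYY-MM-DD
            message = commit["commit"]["message"].splitlines()[0]
            lines.append(f"{date} | {message} | {commit['sha']}")
            if len(lines) == limit:
                break
        page += 1
    return lines

print("\n".join(merge_timeline()))
```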
================================================
FILE: tasks/github/standard/harmony/multi_branch_commit_aggregation/meta.json
================================================
{
"task_id": "multi_branch_commit_aggregation",
"task_name": "Multi Branch Commit Aggregation",
"category_id": "harmony",
"category_name": "Harmony",
"description": "Generate comprehensive commit history report by aggregating changes from multiple branches with contributor analysis and merge timeline.",
"author": "Zijian Wu",
"created_at": "2025-08-15",
"difficulty": "L3",
"tags": [
"repository analysis",
"release coordination"
],
"mcp": [
"github"
],
"meta_data": {
"stateType": "url",
"stateContent": null,
"stateUrl": "https://github.com/mcpmark-source/harmony",
"stateOriginalUrl": "https://github.com/openai/harmony"
}
}
================================================
FILE: tasks/github/standard/harmony/multi_branch_commit_aggregation/verify.py
================================================
import sys
import os
import requests
from typing import Dict, Optional, Tuple
import base64
import json
from dotenv import load_dotenv
def _get_github_api(
endpoint: str, headers: Dict[str, str], org: str
) -> Tuple[bool, Optional[Dict]]:
"""Make a GET request to GitHub API and return (success, response)."""
url = f"https://api.github.com/repos/{org}/harmony/{endpoint}"
try:
response = requests.get(url, headers=headers)
if response.status_code == 200:
return True, response.json()
elif response.status_code == 404:
return False, None
else:
print(f"API error for {endpoint}: {response.status_code}", file=sys.stderr)
return False, None
except Exception as e:
print(f"Exception for {endpoint}: {e}", file=sys.stderr)
return False, None
def _check_branch_exists(branch_name: str, headers: Dict[str, str], org: str) -> bool:
"""Verify that a branch exists in the repository."""
success, _ = _get_github_api(f"branches/{branch_name}", headers, org)
return success
def _get_file_content(
branch: str, file_path: str, headers: Dict[str, str], org: str
) -> Optional[str]:
"""Get the content of a file from a specific branch."""
success, result = _get_github_api(f"contents/{file_path}?ref={branch}", headers, org)
if not success or not result:
return None
try:
content = base64.b64decode(result.get("content", "")).decode("utf-8")
return content
except Exception as e:
print(f"Content decode error for {file_path}: {e}", file=sys.stderr)
return None
def _check_branch_commits_json(content: str) -> bool:
"""Verify BRANCH_COMMITS.json has correct structure and expected data."""
expected_data = {
"pr/45-googlefan256-main": [
{
"sha": "9fa3f54cf2a2501c7dcbf554d5fbdd0de619fdda",
"author": "googlefan256",
"message": "Update format.md",
"files_changed": 1,
},
{
"sha": "3efbf742533a375fc148d75513597e139329578b",
"author": "scott-oai",
"message": "Merge pull request #29 from axion66/improve-readme-and-checks",
"files_changed": 1,
},
{
"sha": "9d653a4c7382abc42d115014d195d9354e7ad357",
"author": "scott-oai",
"message": "Merge pull request #30 from Yuan-ManX/harmony-format",
"files_changed": 1,
},
],
"pr/25-neuralsorcerer-patch-1": [
{
"sha": "c505a03e9c9a388a511b6125756097eee523742a",
"author": "neuralsorcerer",
"message": "fix: `meta_sep` token and add to registry",
"files_changed": 1,
},
{
"sha": "c044bf33f7e835ca6a723ccc97848de25dba5164",
"author": "neuralsorcerer",
"message": "fix: `meta_sep` token in `encoding.rs`",
"files_changed": 1,
},
{
"sha": "b255cbeb6274adbea774f26fd9590922ce8874ed",
"author": "scott-oai",
"message": "Merge pull request #18 from openai/dev/scl/better-ci",
"files_changed": 6,
},
],
"pr/41-amirhosseinghanipour-fix-race-conditions-and-offline-api": [
{
"sha": "1dca6392934bf4e3c403b2ecc2104e8ff3f67f45",
"author": "amirhosseinghanipour",
"message": "fix race conditions and add offline tokenizer loading api",
"files_changed": 8,
},
{
"sha": "9528c7b4a00a3307fd9685fc1328aee11c3d9c90",
"author": "scott-oai",
"message": "version bump",
"files_changed": 2,
},
{
"sha": "82b3afb9eb043343f322c937262cc50405e892c3",
"author": "scott-oai",
"message": "Merge pull request #26 from jordan-wu-97/jordan/fix-function-call-atomic-bool",
"files_changed": 6,
},
],
}
try:
data = json.loads(content)
# Check if all required branches are present
for branch in expected_data.keys():
if branch not in data:
print(
f"Missing branch {branch} in BRANCH_COMMITS.json", file=sys.stderr
)
return False
# Verify the exact content matches expected data
for branch, expected_commits in expected_data.items():
actual_commits = data.get(branch, [])
if len(actual_commits) != 3:
print(
f"Branch {branch} should have exactly 3 commits, found {len(actual_commits)}",
file=sys.stderr,
)
return False
for i, expected_commit in enumerate(expected_commits):
if i >= len(actual_commits):
print(
f"Missing commit {i + 1} for branch {branch}", file=sys.stderr
)
return False
actual_commit = actual_commits[i]
for field in ["sha", "author", "files_changed"]:
if actual_commit.get(field) != expected_commit.get(field):
print(
f"Mismatch in {field} for commit {i + 1} in branch {branch}",
file=sys.stderr,
)
print(
f"Expected: {expected_commit.get(field)}, Got: {actual_commit.get(field)}",
file=sys.stderr,
)
return False
# For message field, use substring matching to be more flexible
expected_message = expected_commit.get("message", "")
actual_message = actual_commit.get("message", "")
if expected_message not in actual_message:
print(
f"Mismatch in message for commit {i + 1} in branch {branch}",
file=sys.stderr,
)
print(
f"Expected: {expected_message}, Got: {actual_message}",
file=sys.stderr,
)
return False
return True
except json.JSONDecodeError as e:
print(f"Invalid JSON in BRANCH_COMMITS.json: {e}", file=sys.stderr)
return False
except Exception as e:
print(f"Error checking BRANCH_COMMITS.json: {e}", file=sys.stderr)
return False
def _check_cross_branch_analysis(content: str) -> bool:
"""Verify CROSS_BRANCH_ANALYSIS.md contains required sections and data."""
# Check for required section header
if "## Top Contributors" not in content:
print(
"Missing section '## Top Contributors' in CROSS_BRANCH_ANALYSIS.md",
file=sys.stderr,
)
return False
# Check for required keyword
if "contributors" not in content.lower():
print(
"Missing keyword 'contributors' in CROSS_BRANCH_ANALYSIS.md",
file=sys.stderr,
)
return False
# Verify the top 3 contributors with correct counts from main branch (order matters)
expected_contributors = [
"scott-oai: 35 commits",
"egorsmkv: 4 commits",
"axion66: 2 commits",
]
for contributor in expected_contributors:
if contributor not in content:
print(
f"Missing or incorrect contributor entry: {contributor}",
file=sys.stderr,
)
return False
return True
def _check_merge_timeline(content: str) -> bool:
"""Verify MERGE_TIMELINE.txt has correct format and expected merge commits."""
expected_timeline = [
"2025-08-06 | Merge pull request #29 from axion66/improve-readme-and-checks | 3efbf742533a375fc148d75513597e139329578b",
"2025-08-06 | Merge pull request #30 from Yuan-ManX/harmony-format | 9d653a4c7382abc42d115014d195d9354e7ad357",
"2025-08-06 | Merge pull request #28 from dkqjrm/fix-typo-format-md | 161e5fe2a57c63e9f8353c4c5b8faa3c3854bb5f",
"2025-08-05 | Merge pull request #26 from jordan-wu-97/jordan/fix-function-call-atomic-bool | 82b3afb9eb043343f322c937262cc50405e892c3",
"2025-08-05 | Merge pull request #18 from openai/dev/scl/better-ci | b255cbeb6274adbea774f26fd9590922ce8874ed",
"2025-08-05 | Merge pull request #21 from Tialo/main | 058ef3257c24fb099aac7960c10ce51c8e55d9fe",
"2025-08-05 | Merge branch 'main' into dev/scl/better-ci | 6375a15ea1b0a486cbb1468964cf8f5800ff5a5c",
"2025-08-05 | Merge pull request #8 from RustedBytes/main | f6179119ca894eda4124c86d408c01fdbf5281f0",
"2025-08-05 | Merge branch 'main' into main | eb86106b6980790b94f5702dc510483c66027277",
"2025-08-05 | Merge pull request #17 from openai/dev/scl/add-docs-to-cargo | 64bca4cf327ebeafa0bbd0345650d86e2d02142f",
]
# Verify each expected timeline entry exists in the content
for i, expected_line in enumerate(expected_timeline):
if expected_line not in content:
print(f"Missing expected timeline entry {i + 1} in MERGE_TIMELINE.txt", file=sys.stderr)
print(f"Expected: {expected_line}", file=sys.stderr)
return False
return True
def verify_task() -> bool:
"""Verify the multi-branch commit aggregation task."""
# Get GitHub token from environment
load_dotenv(".mcp_env")
github_token = os.environ.get("MCP_GITHUB_TOKEN")
if not github_token:
print("Error: MCP_GITHUB_TOKEN environment variable not set", file=sys.stderr)
return False
# Get GitHub organization from environment
github_org = os.environ.get("GITHUB_EVAL_ORG")
if not github_org:
print("Error: GITHUB_EVAL_ORG environment variable not set", file=sys.stderr)
return False
headers = {
"Authorization": f"Bearer {github_token}",
"Accept": "application/vnd.github.v3+json",
}
# 1. Check if branch 'history-report-2025' exists
if not _check_branch_exists("history-report-2025", headers, github_org):
print("Branch 'history-report-2025' does not exist", file=sys.stderr)
return False
print("✓ Branch 'history-report-2025' exists")
# 2. Check BRANCH_COMMITS.json
content = _get_file_content("history-report-2025", "BRANCH_COMMITS.json", headers, github_org)
if not content:
print(
"File 'BRANCH_COMMITS.json' not found in 'history-report-2025' branch",
file=sys.stderr,
)
return False
if not _check_branch_commits_json(content):
return False
print("✓ BRANCH_COMMITS.json has correct structure and data")
# 3. Check CROSS_BRANCH_ANALYSIS.md
content = _get_file_content(
"history-report-2025", "CROSS_BRANCH_ANALYSIS.md", headers, github_org
)
if not content:
print(
"File 'CROSS_BRANCH_ANALYSIS.md' not found in 'history-report-2025' branch",
file=sys.stderr,
)
return False
if not _check_cross_branch_analysis(content):
return False
print("✓ CROSS_BRANCH_ANALYSIS.md contains required sections and data")
# 4. Check MERGE_TIMELINE.txt
content = _get_file_content("history-report-2025", "MERGE_TIMELINE.txt", headers, github_org)
if not content:
print(
"File 'MERGE_TIMELINE.txt' not found in 'history-report-2025' branch",
file=sys.stderr,
)
return False
if not _check_merge_timeline(content):
return False
print("✓ MERGE_TIMELINE.txt has correct format and data")
print("\nAll verification checks passed! ✅")
return True
if __name__ == "__main__":
success = verify_task()
sys.exit(0 if success else 1)
================================================
FILE: tasks/github/standard/harmony/release_management_workflow/description.md
================================================
I need help implementing a comprehensive release management workflow for this harmony repository. Here's what I need you to do:
**Step 1: Analyze Current State**
First, analyze the current open pull requests to understand what changes they contain and their impact on the codebase.
**Step 2: Create Release Branch**
Create a release preparation branch called 'release-v1.1.0' from the current main branch.
**Step 3: Apply Critical Bug Fixes**
On the release branch, apply the MetaSep token fix from PR #25 by creating/updating the file `src/encoding.rs` with the corrected content where FormattingToken::MetaSep maps to "<|meta_sep|>" instead of "<|channel|>".
Also create/update `src/registry.rs` to include the missing MetaSep and MetaEnd token registrations:
```rust
(FormattingToken::MetaSep, "<|meta_sep|>"),
(FormattingToken::MetaEnd, "<|meta_end|>"),
```
**Step 4: Add Missing Utility File**
From PR #26, create the missing shadcn utils file `demo/harmony-demo/src/lib/utils.ts` with content:
```typescript
import { clsx, type ClassValue } from "clsx"
import { twMerge } from "tailwind-merge"
export function cn(...inputs: ClassValue[]) {
return twMerge(clsx(inputs))
}
```
And create/update `.gitignore` to add:
```
# Avoid ignoring shadcn utils
!demo/harmony-demo/src/lib
```
**Step 5: Version Update**
Update the version number in `Cargo.toml`: Change the `version` field in the `[package]` section to `version = "1.1.0"`.
**Step 6: Create Comprehensive Changelog**
Create a `CHANGELOG.md` file in the release branch with the following content:
```markdown
# Changelog
## [1.1.0] - 2025-08-07
### Added
- Added missing shadcn utils.ts file for demo application
- Enhanced gitignore rules to preserve shadcn utilities
### Fixed
- Fixed MetaSep token mapping bug (was incorrectly mapped to channel token)
- Added missing MetaSep and MetaEnd token registrations in registry
- Improved tokenizer registry functionality for meta formatting tokens
### Changed
- Updated version to 1.1.0 for new release cycle
### Technical Details
- MetaSep token now correctly maps to `<|meta_sep|>` instead of `<|channel|>`
- Registry now properly recognizes MetaSep and MetaEnd formatting tokens
- Demo application now includes required utility functions for UI components
```
**Step 7: Create Release Pull Request**
Create a pull request from 'release-v1.1.0' to 'main' with title "Release v1.1.0 - Bug fixes and utility additions" and a detailed description explaining all the integrated changes.
**Step 8: Merge the Pull Request**
After creating the PR, merge it into the main branch using the "squash and merge" method.
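If the merge is performed through the REST API rather than the web UI, the method is selected with the `merge_method` field. A minimal sketch (the PR number and credentials are placeholders):
```python
# Minimal sketch (not part of the task): squash-merge a pull request via the REST API.
import requests

ORG, REPO, TOKEN = "<eval-org>", "harmony", "<github-token>"
PR_NUMBER = 1  # placeholder: the release PR created in Step 7
HEADERS = {"Authorization": f"token {TOKEN}", "Accept": "application/vnd.github.v3+json"}

response = requests.put(
    f"https://api.github.com/repos/{ORG}/{REPO}/pulls/{PR_NUMBER}/merge",
    headers=HEADERS,
    json={"merge_method": "squash"},
)
print(response.status_code, response.json().get("merged"))
```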
**Step 9: Verification**
Ensure the release branch contains at least 4 distinct commits before merging:
1. MetaSep token fix commit
2. Utility file addition commit
3. Version update commit
4. Changelog addition commit
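One way to confirm this before merging is to compare the release branch against main; the compare endpoint reports how many commits the branch is ahead by. A minimal sketch with placeholder credentials:
```python
# Minimal sketch (not part of the task): confirm release-v1.1.0 has >= 4 commits ahead of main.
import requests

ORG, REPO, TOKEN = "<eval-org>", "harmony", "<github-token>"
HEADERS = {"Authorization": f"token {TOKEN}", "Accept": "application/vnd.github.v3+json"}

compare = requests.get(
    f"https://api.github.com/repos/{ORG}/{REPO}/compare/main...release-v1.1.0",
    headers=HEADERS,
).json()

ahead_by = compare.get("ahead_by", 0)
print(f"release-v1.1.0 is {ahead_by} commit(s) ahead of main")
for commit in compare.get("commits", []):
    print(commit["sha"][:7], commit["commit"]["message"].splitlines()[0])
assert ahead_by >= 4, "release branch should contain at least 4 distinct commits"
```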
================================================
FILE: tasks/github/standard/harmony/release_management_workflow/meta.json
================================================
{
"task_id": "release_management_workflow",
"task_name": "Release Management Workflow",
"category_id": "harmony",
"category_name": "Harmony",
"description": "Implement comprehensive release management workflow including bug fixes, version updates, changelog creation, and PR merging.",
"author": "Zijian Wu",
"created_at": "2025-08-15",
"difficulty": "L3",
"tags": [
"release coordination",
"pr workflows"
],
"mcp": [
"github"
],
"meta_data": {
"stateType": "url",
"stateContent": null,
"stateUrl": "https://github.com/mcpmark-source/harmony",
"stateOriginalUrl": "https://github.com/openai/harmony"
}
}
================================================
FILE: tasks/github/standard/harmony/release_management_workflow/verify.py
================================================
import sys
import os
import requests
from typing import Dict, List, Optional, Tuple
import base64
from dotenv import load_dotenv
def _get_github_api(
endpoint: str, headers: Dict[str, str], org: str, repo: str = "harmony"
) -> Tuple[bool, Optional[Dict]]:
"""Make a GET request to GitHub API and return (success, response)."""
url = f"https://api.github.com/repos/{org}/{repo}/{endpoint}"
try:
response = requests.get(url, headers=headers)
if response.status_code == 200:
return True, response.json()
elif response.status_code == 404:
return False, None
else:
print(f"API error for {endpoint}: {response.status_code}", file=sys.stderr)
return False, None
except Exception as e:
print(f"Exception for {endpoint}: {e}", file=sys.stderr)
return False, None
def _check_branch_exists(
branch_name: str, headers: Dict[str, str], org: str, repo: str = "harmony"
) -> bool:
"""Verify that a branch exists in the repository."""
success, _ = _get_github_api(f"branches/{branch_name}", headers, org, repo)
return success
def _check_file_content(
branch: str,
file_path: str,
keywords: List[str],
headers: Dict[str, str],
org: str,
repo: str = "harmony",
) -> bool:
"""Verify that a file exists in branch and contains required keywords."""
success, result = _get_github_api(
f"contents/{file_path}?ref={branch}", headers, org, repo
)
if not success or not result:
return False
if keywords and result.get("content"):
try:
content = base64.b64decode(result.get("content", "")).decode("utf-8")
return all(keyword in content for keyword in keywords)
except Exception as e:
print(f"Content decode error for {file_path}: {e}", file=sys.stderr)
return False
return True
def _check_specific_file_content(
branch: str,
file_path: str,
expected_content: str,
headers: Dict[str, str],
org: str,
repo: str = "harmony",
min_length: int = 100,
) -> bool:
"""Verify that a file contains specific exact content and has reasonable size."""
success, result = _get_github_api(
f"contents/{file_path}?ref={branch}", headers, org, repo
)
if not success or not result:
return False
if result.get("content"):
try:
content = base64.b64decode(result.get("content", "")).decode("utf-8")
# Check both that expected content exists and file has reasonable content
return expected_content in content and len(content) >= min_length
except Exception as e:
print(f"Content decode error for {file_path}: {e}", file=sys.stderr)
return False
return False
def _check_pr_merged(
title_substring: str,
base_branch: str,
headers: Dict[str, str],
org: str,
repo: str = "harmony",
) -> Tuple[bool, Optional[int]]:
"""Check if a PR with specified title was merged into base branch and return PR number."""
# Check closed PRs to find merged ones
success, prs = _get_github_api(
"pulls?state=closed&per_page=100", headers, org, repo
)
if not success or not prs:
return False, None
for pr in prs:
title_match = title_substring.lower() in pr.get("title", "").lower()
base_match = pr.get("base", {}).get("ref") == base_branch
is_merged = pr.get("merged_at") is not None
if title_match and base_match and is_merged:
return True, pr.get("number")
return False, None
def _check_pr_squash_merged(
pr_number: int, headers: Dict[str, str], org: str, repo: str = "harmony"
) -> bool:
"""Check if a PR was merged using squash and merge method."""
# Get the PR details
success, pr = _get_github_api(f"pulls/{pr_number}", headers, org, repo)
if not success or not pr:
return False
if not pr.get("merged_at"):
return False
merge_commit_sha = pr.get("merge_commit_sha")
if not merge_commit_sha:
return False
# Get the merge commit details
success, commit = _get_github_api(f"commits/{merge_commit_sha}", headers, org, repo)
if not success or not commit:
return False
# For squash and merge, the commit will have exactly one parent
# and the commit message typically includes the PR number
parents = commit.get("parents", [])
commit_message = commit.get("commit", {}).get("message", "")
# Squash and merge commits have exactly 1 parent (the base branch)
# Regular merge commits have 2 parents (base and head branches)
if len(parents) == 1 and f"#{pr_number}" in commit_message:
return True
return False
def verify() -> bool:
"""
Programmatically verify that the release management workflow meets the
requirements described in description.md.
"""
# Load environment variables from .mcp_env
load_dotenv(".mcp_env")
# Get GitHub token and org
github_token = os.environ.get("MCP_GITHUB_TOKEN")
github_org = os.environ.get("GITHUB_EVAL_ORG")
if not github_token:
print("Error: MCP_GITHUB_TOKEN environment variable not set", file=sys.stderr)
return False
if not github_org:
print("Error: GITHUB_EVAL_ORG environment variable not set", file=sys.stderr)
return False
# Configuration constants
RELEASE_BRANCH = "release-v1.1.0"
# Expected content checks with minimum file sizes to ensure files aren't just stubs
METASEP_FIX = 'FormattingToken::MetaSep => "<|meta_sep|>"'
REGISTRY_FIX = '(FormattingToken::MetaSep, "<|meta_sep|>")'
METAEND_FIX = '(FormattingToken::MetaEnd, "<|meta_end|>")'
UTILS_CONTENT = "export function cn(...inputs: ClassValue[])"
GITIGNORE_ADDITION = "!demo/harmony-demo/src/lib"
VERSION_110 = 'version = "1.1.0"'
CHANGELOG_KEYWORDS = [
"## [1.1.0] - 2025-08-07",
"MetaSep token mapping bug",
"shadcn utils.ts file",
"Fixed MetaSep token",
"Registry now properly recognizes",
]
headers = {
"Authorization": f"token {github_token}",
"Accept": "application/vnd.github.v3+json",
}
# Run verification checks
print("Verifying GitHub release management workflow completion...")
# 1. Check release branch exists
print("1. Verifying release branch exists...")
if not _check_branch_exists(RELEASE_BRANCH, headers, github_org):
print(f"Error: Branch '{RELEASE_BRANCH}' not found", file=sys.stderr)
return False
# 2. Check MetaSep fix in encoding.rs (with min content length to ensure file wasn't gutted)
print("2. Verifying MetaSep token fix in encoding.rs...")
if not _check_specific_file_content(
"main", "src/encoding.rs", METASEP_FIX, headers, github_org, min_length=500
):
print(
"Error: MetaSep token fix not found in src/encoding.rs or file is too small",
file=sys.stderr,
)
return False
# 3. Check registry updates (both MetaSep and MetaEnd)
print("3. Verifying MetaSep and MetaEnd registry additions...")
if not _check_specific_file_content(
"main", "src/registry.rs", REGISTRY_FIX, headers, github_org, min_length=500
):
print(
"Error: MetaSep registry fix not found in src/registry.rs or file is too small",
file=sys.stderr,
)
return False
if not _check_specific_file_content(
"main", "src/registry.rs", METAEND_FIX, headers, github_org, min_length=500
):
print(
"Error: MetaEnd registry fix not found in src/registry.rs", file=sys.stderr
)
return False
# 4. Check utils.ts file exists with correct content
print("4. Verifying shadcn utils.ts file...")
if not _check_specific_file_content(
"main",
"demo/harmony-demo/src/lib/utils.ts",
UTILS_CONTENT,
headers,
github_org,
min_length=50,
):
print("Error: utils.ts file not found or incorrect content", file=sys.stderr)
return False
# 5. Check .gitignore update
print("5. Verifying .gitignore update...")
if not _check_specific_file_content(
"main", ".gitignore", GITIGNORE_ADDITION, headers, github_org, min_length=100
):
print("Error: .gitignore update not found", file=sys.stderr)
return False
# 6. Check version update in Cargo.toml only (pyproject.toml uses dynamic versioning)
print("6. Verifying version update in Cargo.toml...")
if not _check_specific_file_content(
"main", "Cargo.toml", VERSION_110, headers, github_org, min_length=200
):
print("Error: Version 1.1.0 not found in Cargo.toml", file=sys.stderr)
return False
# 7. Check CHANGELOG.md exists with required content
print("7. Verifying CHANGELOG.md...")
if not _check_file_content(
"main", "CHANGELOG.md", CHANGELOG_KEYWORDS, headers, github_org
):
print(
"Error: CHANGELOG.md not found or missing required content", file=sys.stderr
)
return False
# 8. Check release PR was merged and get PR number
print("8. Verifying release pull request was merged...")
pr_merged, pr_number = _check_pr_merged(
"Release v1.1.0", "main", headers, github_org
)
if not pr_merged:
print("Error: Release pull request not found or not merged", file=sys.stderr)
return False
# 9. Check PR was merged using squash and merge
print("9. Verifying pull request was merged using 'squash and merge' method...")
if pr_number and not _check_pr_squash_merged(pr_number, headers, github_org):
print(
f"Error: Pull request #{pr_number} was not merged using 'squash and merge' method",
file=sys.stderr,
)
return False
print("\n✓ All verification checks passed!")
print("Release management workflow completed successfully.")
return True
if __name__ == "__main__":
success = verify()
sys.exit(0 if success else 1)
================================================
FILE: tasks/github/standard/mcpmark-cicd/deployment_status_workflow/description.md
================================================
I need you to create a Deployment Status workflow for this Node.js project. The project currently has no GitHub Actions workflows, so you'll be building a deployment-focused CI/CD workflow from scratch that responds to push events on the main branch. Here's what needs to be implemented:
## Deployment Status Workflow
Create `.github/workflows/deployment-status.yml` that triggers on `push` to `main` branch with these sequential jobs:
### 1. **pre-deployment** job (name: `pre-deployment`):
- Runs basic quality checks (lint and test)
- Creates deployment tracking issue with title: "Deployment Tracking - [commit-sha]"
- Adds labels: `deployment`, `in-progress`
- Captures previous commit SHA and package version information
- Posts comment containing "Pre-deployment checks completed"
### 2. **rollback-preparation** job (name: `rollback-preparation`):
- Depends on: pre-deployment
- Creates comprehensive rollback artifacts including:
* Executable rollback script with proper error handling
* Configuration backups (package.json, package-lock.json, environment templates)
* Dependency verification script for compatibility checking
* Detailed rollback documentation with step-by-step instructions
* Compressed rollback package with SHA256 checksums
- Uploads rollback artifacts to GitHub Actions with 30-day retention
- Posts comment on deployment issue that MUST contain the following verifiable elements:
* Title: "🔄 Rollback Plan Ready"
* Previous commit SHA (format: "Previous Commit: [sha]")
* Current commit SHA (format: "Current Commit: [sha]")
* Package version (format: "Package Version: [version]")
* Artifact name (format: "Artifact: rollback-package-[commit-sha]")
* At least 5 checkmarks (✅) indicating completed rollback components
* Quick rollback command section with bash code block
* Script verification status: "Rollback script created: true"
* Backup verification status: "Configuration backup: true"
* Artifact checksum (format: "SHA256: [checksum-value]")
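The checksum mentioned above is just the SHA-256 digest of the compressed rollback package. In the workflow this would typically come from a shell step (e.g. `sha256sum`); the Python sketch below shows the equivalent computation for a hypothetical package path:
```python
# Minimal sketch (not part of the task): compute the SHA-256 checksum of a rollback package.
import hashlib
from pathlib import Path

package = Path("rollback-package-abc1234.tar.gz")  # hypothetical artifact name

digest = hashlib.sha256()
with package.open("rb") as fh:
    for chunk in iter(lambda: fh.read(8192), b""):
        digest.update(chunk)

print(f"SHA256: {digest.hexdigest()}")  # format expected in the rollback comment
```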
### 3. **post-deployment** job (name: `post-deployment`):
- Depends on: rollback-preparation
- Removes `in-progress` label and adds `completed` label
- Posts final comment containing "Deployment Completed Successfully" with rollback artifact details
- Closes the deployment tracking issue
## Implementation Requirements:
**Step 1: Create Feature Branch**
Create a new branch called `deployment-status-workflow` from main.
**Step 2: Implement the Workflow**
Create `.github/workflows/deployment-status.yml` with proper YAML syntax:
- Trigger only on push to main branch
- Sequential job execution: pre-deployment → rollback-preparation → post-deployment
- Use github-script actions for issue management
- Avoid identifier conflicts in github-script actions (don't redeclare 'github')
- Include proper error handling and script validation
- Implement comprehensive rollback artifact creation and verification
- Use proper fetch-depth for accessing commit history
- Include artifact upload/download capabilities with checksums
**Step 3: Create and Merge Pull Request**
Create a comprehensive pull request and merge it to main:
- Title: "Implement Deployment Status Workflow"
- Detailed description of the workflow and its purpose
- Merge the pull request to main branch to trigger the deployment workflow
================================================
FILE: tasks/github/standard/mcpmark-cicd/deployment_status_workflow/meta.json
================================================
{
"task_id": "deployment_status_workflow",
"task_name": "Deployment Status Workflow",
"category_id": "mcpmark-cicd",
"category_name": "MCPMark CI/CD",
"description": "Create deployment status workflow with pre-deployment checks, rollback preparation, and comprehensive issue tracking for deployments.",
"author": "Zijian Wu",
"created_at": "2025-08-15",
"difficulty": "L3",
"tags": [
"ci/cd automation",
"workflow automation"
],
"mcp": [
"github"
],
"meta_data": {
"stateType": "url",
"stateContent": null,
"stateUrl": "https://github.com/mcpmark-source/mcpmark-cicd",
"stateOriginalUrl": null
}
}
================================================
FILE: tasks/github/standard/mcpmark-cicd/deployment_status_workflow/verify.py
================================================
import sys
import os
import re
import requests
import time
from typing import Dict, List, Optional, Tuple
from dotenv import load_dotenv
def _get_github_api(
endpoint: str, headers: Dict[str, str], owner: str, repo: str
) -> Tuple[bool, Optional[Dict]]:
"""Make a GET request to GitHub API and return (success, response)."""
url = f"https://api.github.com/repos/{owner}/{repo}/{endpoint}"
try:
response = requests.get(url, headers=headers)
if response.status_code == 200:
return True, response.json()
elif response.status_code == 404:
return False, None
else:
print(f"API error for {endpoint}: {response.status_code}", file=sys.stderr)
return False, None
except Exception as e:
print(f"Exception for {endpoint}: {e}", file=sys.stderr)
return False, None
def _search_github_issues(
query: str, headers: Dict[str, str]
) -> Tuple[bool, Optional[List]]:
"""Search GitHub issues using the search API."""
url = f"https://api.github.com/search/issues?q={query}&per_page=100"
try:
response = requests.get(url, headers=headers)
if response.status_code == 200:
data = response.json()
return True, data.get("items", [])
else:
print(f"Search API error: {response.status_code}", file=sys.stderr)
return False, None
except Exception as e:
print(f"Search exception: {e}", file=sys.stderr)
return False, None
def _wait_for_workflow_completion(
headers: Dict[str, str], owner: str, repo: str, max_wait: int = 90
) -> bool:
"""Wait for GitHub Actions workflows to complete processing."""
print("⏳ Waiting for deployment status workflows to complete...")
start_time = time.time()
no_workflow_check_count = 0
while time.time() - start_time < max_wait:
try:
# Check workflow runs for deployment-status.yml
success, response = _get_github_api(
"actions/workflows/deployment-status.yml/runs?per_page=10",
headers,
owner,
repo,
)
if success and response:
runs = response.get("workflow_runs", [])
if len(runs) > 0:
# Check status of recent runs
running_count = 0
completed_count = 0
failed_count = 0
for run in runs[:3]: # Check recent runs
status = run["status"]
conclusion = run.get("conclusion")
if status == "completed":
completed_count += 1
if conclusion == "failure":
failed_count += 1
elif status in ["in_progress", "queued"]:
running_count += 1
print(
f" Status: {completed_count} completed, {running_count} running/queued"
)
# Wait until NO workflows are running
if running_count == 0:
if failed_count > 0:
print(
f"⚠️ Warning: {failed_count} workflow runs failed, but continuing verification..."
)
print(
f"✅ All workflows completed. Found {completed_count} completed runs."
)
# Additional wait to ensure all processing is done
print(
"⏳ Additional wait for deployment processing to complete..."
)
time.sleep(5)
return True
else:
# No workflow runs found
no_workflow_check_count += 1
if no_workflow_check_count == 1:
print(
" No workflow runs found yet, waiting 5 seconds and checking once more..."
)
time.sleep(5)
continue
elif no_workflow_check_count >= 2:
print(
"⚠️ No workflow runs detected after 2 checks. Workflow may not have been triggered."
)
print(" Continuing with verification...")
return False
print(f"⏳ Still waiting... ({int(time.time() - start_time)}s elapsed)")
time.sleep(5)
except Exception as e:
print(f"⚠️ Error checking workflow status: {e}")
time.sleep(5)
print(f"⚠️ Workflow completion wait timed out after {max_wait}s")
return False
def _verify_workflow_runs(
headers: Dict[str, str], owner: str, repo: str
) -> Tuple[bool, List[str], Optional[Dict]]:
"""Verify that the deployment status workflow runs have the correct jobs."""
print("\n⚙️ Verifying deployment status workflow runs...")
errors = []
# Get the most recent workflow run
success, response = _get_github_api(
"actions/workflows/deployment-status.yml/runs?per_page=5",
headers,
owner,
repo,
)
if not success or not response:
return False, ["Failed to fetch workflow runs"], None
runs = response.get("workflow_runs", [])
if not runs:
return False, ["No workflow runs found for deployment-status.yml"], None
# Find the most recent successful run
latest_successful_run = None
for run in runs:
if run.get("conclusion") == "success":
latest_successful_run = run
break
if not latest_successful_run:
return False, ["No successful workflow runs found"], None
run_id = latest_successful_run["id"]
print(f" Found successful workflow run #{run_id}")
# Get jobs for this run
success, jobs_response = _get_github_api(
f"actions/runs/{run_id}/jobs", headers, owner, repo
)
if not success:
return False, ["Failed to fetch workflow jobs"], None
jobs = jobs_response.get("jobs", [])
expected_jobs = ["pre-deployment", "rollback-preparation", "post-deployment"]
found_jobs = [job["name"] for job in jobs]
missing_jobs = [job for job in expected_jobs if job not in found_jobs]
if missing_jobs:
errors.append(f"Missing jobs: {missing_jobs}. Found: {found_jobs}")
else:
print(f" ✅ All 3 required jobs found: {found_jobs}")
# Verify all jobs succeeded
failed_jobs = [job["name"] for job in jobs if job["conclusion"] != "success"]
if failed_jobs:
errors.append(f"Failed jobs: {failed_jobs}")
else:
print(" ✅ All jobs completed successfully")
# Verify sequential execution (each job should start after the previous one)
if len(jobs) >= 3:
job_times = {}
for job in jobs:
if job["name"] in expected_jobs and job["started_at"]:
job_times[job["name"]] = job["started_at"]
if len(job_times) >= 3:
# Check that jobs ran in correct sequence
import datetime
times = {
name: datetime.datetime.fromisoformat(time.replace("Z", "+00:00"))
for name, time in job_times.items()
}
# pre-deployment should start first
# rollback-preparation should start after pre-deployment
# post-deployment should start after rollback-preparation
if all(job in times for job in expected_jobs):
if (
times["rollback-preparation"] <= times["pre-deployment"]
or times["post-deployment"] <= times["rollback-preparation"]
):
errors.append("Jobs did not run in correct sequential order")
else:
print(" ✅ Jobs ran in correct sequential order")
else:
errors.append(
"Not enough job timing data to verify sequential execution"
)
return len(errors) == 0, errors, latest_successful_run
def _verify_deployment_issue(
run_data: Dict, headers: Dict[str, str], owner: str, repo: str
) -> Tuple[bool, List[str]]:
"""Verify that a deployment tracking issue was created and closed properly."""
print("\n📋 Verifying deployment tracking issue...")
errors = []
# Extract commit SHA from the workflow run
head_sha = run_data.get("head_sha")
if not head_sha:
return False, ["Could not determine head SHA from workflow run"]
short_sha = head_sha[:7]
expected_title = f"Deployment Tracking - {short_sha}"
# Search for the deployment tracking issue
success, issues = _search_github_issues(
f'repo:{owner}/{repo} "{expected_title}" is:issue', headers
)
if not success:
return False, ["Failed to search for deployment tracking issue"]
# Find the exact issue
deployment_issue = None
for issue in issues:
if issue.get("title") == expected_title:
deployment_issue = issue
break
if not deployment_issue:
return False, [f"Deployment tracking issue '{expected_title}' not found"]
issue_number = deployment_issue["number"]
print(f" Found deployment tracking issue #{issue_number}: {expected_title}")
# Check that issue is closed
if deployment_issue.get("state") != "closed":
errors.append(
f"Deployment issue #{issue_number} is not closed (state: {deployment_issue.get('state')})"
)
else:
print(f" ✅ Deployment issue #{issue_number} is closed")
# Check required labels
expected_labels = ["deployment", "completed"]
actual_labels = [label["name"] for label in deployment_issue.get("labels", [])]
missing_labels = [label for label in expected_labels if label not in actual_labels]
if missing_labels:
errors.append(
f"Missing labels on deployment issue: {missing_labels}. Found: {actual_labels}"
)
else:
print(f" ✅ Required labels found: {expected_labels}")
# Get issue comments to verify GitHub Actions bot comments
success, comments = _get_github_api(
f"issues/{issue_number}/comments", headers, owner, repo
)
if not success:
errors.append("Failed to get deployment issue comments")
return len(errors) == 0, errors
# Filter for GitHub Actions bot comments only
bot_comments = [
comment
for comment in comments
if comment.get("user", {}).get("login") == "github-actions[bot]"
]
if not bot_comments:
errors.append("No comments found from GitHub Actions bot")
return len(errors) == 0, errors
print(f" Found {len(bot_comments)} comment(s) from GitHub Actions bot")
# Get all bot comment bodies
bot_comment_bodies = [comment.get("body", "") for comment in bot_comments]
all_bot_comments = " ".join(bot_comment_bodies)
# Check for required GitHub Actions bot comment indicators
required_comment_indicators = [
"Pre-deployment checks completed",
"🔄 Rollback Plan Ready",
"Deployment Completed Successfully",
]
for indicator in required_comment_indicators:
if indicator not in all_bot_comments:
errors.append(
f"Missing required GitHub Actions bot comment indicator: '{indicator}'"
)
else:
print(f" ✅ Found GitHub Actions bot comment indicator: '{indicator}'")
# Find and verify the rollback plan comment from GitHub Actions bot
rollback_comment = None
for comment in bot_comments:
if "🔄 Rollback Plan Ready" in comment.get("body", ""):
rollback_comment = comment.get("body", "")
break
if rollback_comment:
print(" ✅ Found rollback plan comment from GitHub Actions bot")
# Check for required rollback plan elements
required_elements = [
"**Previous Commit**:",
"**Current Commit**:",
"**Package Version**:",
"✅ Executable rollback script created",
"✅ Configuration backups saved",
"✅ Dependency verification script prepared",
"✅ Comprehensive rollback documentation generated",
"✅ Compressed rollback package created",
"**SHA256**:",
"**Artifact**:",
"Quick Rollback Commands",
]
for element in required_elements:
if element not in rollback_comment:
errors.append(f"Missing element in rollback plan: '{element}'")
else:
print(f" ✅ Found rollback plan element: '{element}'")
# Verify commit SHAs in rollback comment
if f"**Current Commit**: {head_sha}" in rollback_comment:
print(f" ✅ Current commit SHA verified: {head_sha}")
else:
errors.append(
f"Current commit SHA {head_sha} not found in rollback comment"
)
# Extract and verify previous commit SHA
if "**Previous Commit**:" in rollback_comment:
prev_sha_match = re.search(
r"\*\*Previous Commit\*\*:\s*([a-f0-9]{40})", rollback_comment
)
if prev_sha_match:
prev_sha = prev_sha_match.group(1)
print(f" ✅ Previous commit SHA found: {prev_sha}")
# Verify it's a valid 40-character SHA
if len(prev_sha) != 40:
errors.append(
f"Previous commit SHA has invalid length: {len(prev_sha)}"
)
else:
errors.append(
"Previous commit SHA format not found in rollback comment"
)
else:
errors.append("Previous commit SHA not found in rollback comment")
# Verify SHA256 checksum is present
sha256_match = re.search(r"\*\*SHA256\*\*:\s*([a-f0-9]{64})", rollback_comment)
if sha256_match:
sha256_value = sha256_match.group(1)
print(f" ✅ SHA256 checksum found: {sha256_value[:16]}...")
else:
errors.append(
"SHA256 checksum not found or invalid format in rollback comment"
)
else:
errors.append("Rollback plan comment not found from GitHub Actions bot")
return len(errors) == 0, errors
def verify() -> bool:
"""
Verify that the deployment status workflow automation is working correctly.
"""
# Load environment variables
load_dotenv(".mcp_env")
github_token = os.environ.get("MCP_GITHUB_TOKEN")
if not github_token:
print("Error: MCP_GITHUB_TOKEN environment variable not set", file=sys.stderr)
return False
# Get GitHub organization
github_org = os.environ.get("GITHUB_EVAL_ORG")
if not github_org:
print("Error: GITHUB_EVAL_ORG environment variable not set", file=sys.stderr)
return False
# Repository configuration
owner = github_org
repo = "mcpmark-cicd"
headers = {
"Authorization": f"token {github_token}",
"Accept": "application/vnd.github.v3+json",
}
print("🔍 Starting Deployment Status Workflow Verification")
print("=" * 60)
# Wait for workflows to complete
workflows_completed = _wait_for_workflow_completion(headers, owner, repo)
if not workflows_completed:
print(
"⚠️ Warning: Workflows may still be running. Continuing with verification..."
)
# Verify workflow runs and jobs
all_passed = True
# 1. Verify workflow runs have correct jobs
runs_ok, runs_errors, run_data = _verify_workflow_runs(headers, owner, repo)
if not runs_ok:
all_passed = False
print("❌ Workflow Runs Verification Failed:")
for error in runs_errors:
print(f" - {error}")
else:
print("✅ Workflow Runs Verification Passed")
# 2. Verify deployment issue if workflow runs passed
if run_data:
issue_ok, issue_errors = _verify_deployment_issue(
run_data, headers, owner, repo
)
if not issue_ok:
all_passed = False
print("❌ Deployment Issue Verification Failed:")
for error in issue_errors:
print(f" - {error}")
else:
print("✅ Deployment Issue Verification Passed")
print("\n" + "=" * 60)
if all_passed:
print("🎉 All Deployment Status Workflow verifications PASSED!")
print("\n📋 Summary:")
print(
" ✅ Workflow runs with correct 3 sequential jobs: pre-deployment, rollback-preparation, post-deployment"
)
print(" ✅ Deployment tracking issue created and closed with proper labels")
print(" ✅ Issue contains rollback plan with all required elements")
print(" ✅ Previous and current commit SHAs are correctly tracked")
print(" ✅ All workflow automation comments are present")
print(
"\n🤖 The GitHub Actions deployment status workflow is working correctly!"
)
else:
print("❌ Deployment Status Workflow verification FAILED!")
print(" Some components did not meet the expected automation requirements.")
return all_passed
if __name__ == "__main__":
success = verify()
sys.exit(0 if success else 1)
================================================
FILE: tasks/github/standard/mcpmark-cicd/issue_management_workflow/description.md
================================================
I need you to create an intelligent Issue Management automation workflow for this Node.js project. The project currently has no GitHub Actions workflows, so you'll be building an issue-focused automation system from scratch that responds to issue events. Here's what needs to be implemented:
## Issue Management Workflow
Create `.github/workflows/issue-automation.yml` that triggers on `issues` events (opened, labeled) with these jobs:
### 1. **issue-triage** job:
- Auto-assigns category labels based on keywords in **issue title** (case-insensitive):
- Title contains "bug" → adds `bug` label
- Title contains "epic" → adds `epic` label
- Title contains "maintenance" → adds `maintenance` label
- Auto-assigns priority labels based on keywords in **issue title OR body** (case-insensitive, highest priority wins if multiple keywords found):
- "critical", "urgent", "production", "outage" → `priority-critical`
- "important", "high", "blocking" → `priority-high`
- "medium", "normal" → `priority-medium` (default if no priority keywords found)
- "low", "nice-to-have", "minor" → `priority-low`
- All issues get `needs-triage` label initially
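The triage rules above boil down to a keyword lookup in which the most severe match wins. The workflow itself would run this logic in a github-script step; the intent can be sketched in a few lines of Python (illustrative only, with a hypothetical example issue):
```python
# Illustrative sketch of the triage rules (the workflow would implement this in github-script).
CATEGORY_KEYWORDS = {"bug": "bug", "epic": "epic", "maintenance": "maintenance"}
PRIORITY_KEYWORDS = [  # ordered from highest to lowest priority; first match wins
    ("priority-critical", ["critical", "urgent", "production", "outage"]),
    ("priority-high", ["important", "high", "blocking"]),
    ("priority-medium", ["medium", "normal"]),
    ("priority-low", ["low", "nice-to-have", "minor"]),
]

def triage_labels(title: str, body: str) -> list:
    labels = ["needs-triage"]
    title_lower, text = title.lower(), f"{title} {body}".lower()
    labels += [label for keyword, label in CATEGORY_KEYWORDS.items() if keyword in title_lower]
    priority = next(
        (label for label, keywords in PRIORITY_KEYWORDS if any(k in text for k in keywords)),
        "priority-medium",  # default when no priority keywords are found
    )
    return labels + [priority]

# Hypothetical issue body; prints ['needs-triage', 'bug', 'priority-high']
print(triage_labels("Bug: Login form validation not working", "This is blocking the release"))
```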
### 2. **task-breakdown** job:
- For issues with a title containing "Epic", create exactly 4 sub-issues with the pattern: "[SUBTASK] [Original Title] - Task N: [Task Name]"
- Task names: 1. Requirements Analysis, 2. Design and Architecture, 3. Implementation, 4. Testing and Documentation
- Links sub-issues to parent using "Related to #[parent-number]" in sub-issue body
- Updates parent issue body with "## Epic Tasks" checklist linking to sub-issue numbers
- All sub-issues get `enhancement` and `needs-review` labels
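Each sub-issue is an ordinary issue created through the REST API with the parent reference in its body; a minimal sketch (org, token, and parent number are placeholders):
```python
# Minimal sketch (not part of the task): create the four epic sub-issues via the REST API.
import requests

ORG, REPO, TOKEN = "<eval-org>", "mcpmark-cicd", "<github-token>"
HEADERS = {"Authorization": f"token {TOKEN}", "Accept": "application/vnd.github.v3+json"}

parent_number = 42  # placeholder: the epic issue number
parent_title = "Epic: Redesign user dashboard interface"
task_names = [
    "Requirements Analysis",
    "Design and Architecture",
    "Implementation",
    "Testing and Documentation",
]

for n, task in enumerate(task_names, start=1):
    requests.post(
        f"https://api.github.com/repos/{ORG}/{REPO}/issues",
        headers=HEADERS,
        json={
            "title": f"[SUBTASK] {parent_title} - Task {n}: {task}",
            "body": f"Related to #{parent_number}",
            "labels": ["enhancement", "needs-review"],
        },
    )
```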
### 3. **auto-response** job:
- Checks if the issue author is creating their first issue in this repository (not first on GitHub globally, but first in this specific repo)
- If first issue in repo: adds `first-time-contributor` label and posts welcome message
- Posts different responses based on issue type:
- `bug` issues: comment must contain "Bug Report Guidelines"
- `epic` issues: comment must contain "Feature Request Process"
- `maintenance` issues: comment must contain "Maintenance Guidelines"
- Sets milestone "v1.0.0" for `priority-high` and `priority-critical` issues
- Changes status from `needs-triage` to `needs-review` after response
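Checking "first issue in this repository" can be done by listing the repository's issues filtered by creator and counting anything other than the one just opened; note that pull requests come back from the same endpoint and need to be excluded. A minimal sketch with placeholder values:
```python
# Minimal sketch (not part of the task): is this the author's first issue in this repository?
import requests

ORG, REPO, TOKEN = "<eval-org>", "mcpmark-cicd", "<github-token>"
HEADERS = {"Authorization": f"token {TOKEN}", "Accept": "application/vnd.github.v3+json"}

def is_first_issue(author: str, current_issue_number: int) -> bool:
    items = requests.get(
        f"https://api.github.com/repos/{ORG}/{REPO}/issues",
        headers=HEADERS,
        params={"creator": author, "state": "all", "per_page": 100},
    ).json()
    # The issues endpoint also returns pull requests; keep real issues only.
    previous = [
        item for item in items
        if "pull_request" not in item and item["number"] != current_issue_number
    ]
    return len(previous) == 0

print(is_first_issue("octocat", 7))  # placeholder author and issue number
```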
## Label Management Requirements
The system must create and manage these specific labels:
### Category Labels:
- `bug` - Something isn't working
- `enhancement` - New feature or request
- `epic` - Large feature requiring multiple sub-tasks
- `maintenance` - Maintenance and housekeeping tasks
### Priority Labels:
- `priority-critical` - Critical priority issue
- `priority-high` - High priority issue
- `priority-medium` - Medium priority issue
- `priority-low` - Low priority issue
### Status Labels:
- `needs-triage` - Needs to be reviewed by maintainers
- `needs-review` - Awaiting review from maintainers
- `first-time-contributor` - Issue created by first-time contributor
## Implementation Requirements:
**Step 1: Create Feature Branch**
Create a new branch called `issue-management-workflow` from main.
**Step 2: Create Supporting Files**
Create these additional files on the new branch:
- `.github/ISSUE_TEMPLATE/bug_report.md` - Bug report template
- `.github/ISSUE_TEMPLATE/feature_request.md` - Feature request template
- `.github/ISSUE_TEMPLATE/maintenance_report.md` - Maintenance report template
**Step 3: Implement the Workflow**
Create `.github/workflows/issue-automation.yml` with proper YAML syntax.
Include:
- Appropriate triggers for issues events
- Job dependencies where needed
- Error handling and graceful fallbacks
- Avoid identifier conflicts in github-script actions (don't redeclare 'github')
**Step 4: Create and Merge Pull Request**
Create a comprehensive pull request and merge it to main:
- Title: "Implement Issue Management Automation Workflow"
- Detailed description of the workflow and its purpose
- Include all workflow files and templates created
- Merge the pull request to main branch
**Step 5: Test the Workflow**
Create test issues to demonstrate the issue automation workflow:
1. **Bug Issue**: "Bug: Login form validation not working"
- Expected: `bug`, `priority-high`, `needs-triage`→`needs-review`, milestone "v1.0.0"
- Auto-response comment must contain "Bug Report Guidelines"
2. **Epic Issue**: "Epic: Redesign user dashboard interface"
- Expected: `epic`, `priority-high`, `needs-triage`→`needs-review`, milestone "v1.0.0"
- Must create 4 sub-issues with `enhancement` and `needs-review` labels
- Parent updated with "## Epic Tasks" checklist, sub-issues linked with "Related to #[parent-number]"
- Auto-response comment must contain "Feature Request Process"
3. **Maintenance Issue**: "Weekly maintenance cleanup and refactor"
- Expected: `maintenance`, `priority-medium`, `needs-triage`→`needs-review`, no milestone
- Auto-response comment must contain "Maintenance Guidelines"
================================================
FILE: tasks/github/standard/mcpmark-cicd/issue_management_workflow/meta.json
================================================
{
"task_id": "issue_management_workflow",
"task_name": "Issue Management Workflow",
"category_id": "mcpmark-cicd",
"category_name": "MCPMark CI/CD",
"description": "Build intelligent issue management automation with auto-triage, task breakdown for epics, and first-time contributor handling.",
"author": "Zijian Wu",
"created_at": "2025-08-15",
"difficulty": "L3",
"tags": [
"issue management",
"workflow automation"
],
"mcp": [
"github"
],
"meta_data": {
"stateType": "url",
"stateContent": null,
"stateUrl": "https://github.com/mcpmark-source/mcpmark-cicd",
"stateOriginalUrl": null
}
}
================================================
FILE: tasks/github/standard/mcpmark-cicd/issue_management_workflow/verify.py
================================================
import sys
import os
import requests
import time
from typing import Dict, List, Optional, Tuple
from dotenv import load_dotenv
def _get_github_api(
endpoint: str, headers: Dict[str, str], owner: str, repo: str
) -> Tuple[bool, Optional[Dict]]:
"""Make a GET request to GitHub API and return (success, response)."""
url = f"https://api.github.com/repos/{owner}/{repo}/{endpoint}"
try:
response = requests.get(url, headers=headers)
if response.status_code == 200:
return True, response.json()
elif response.status_code == 404:
return False, None
else:
print(f"API error for {endpoint}: {response.status_code}", file=sys.stderr)
return False, None
except Exception as e:
print(f"Exception for {endpoint}: {e}", file=sys.stderr)
return False, None
def _search_github_issues(
query: str, headers: Dict[str, str]
) -> Tuple[bool, Optional[List]]:
"""Search GitHub issues using the search API."""
url = f"https://api.github.com/search/issues?q={query}&per_page=100"
try:
response = requests.get(url, headers=headers)
if response.status_code == 200:
data = response.json()
return True, data.get("items", [])
else:
print(f"Search API error: {response.status_code}", file=sys.stderr)
return False, None
except Exception as e:
print(f"Search exception: {e}", file=sys.stderr)
return False, None
def _wait_for_workflow_completion(
headers: Dict[str, str], owner: str, repo: str, max_wait: int = 90
) -> bool:
"""Wait for GitHub Actions workflows to complete processing."""
print("⏳ Waiting for GitHub Actions workflows to complete...")
start_time = time.time()
expected_runs = 3 # We created 3 test issues
no_workflow_check_count = 0
while time.time() - start_time < max_wait:
try:
# Check workflow runs
success, response = _get_github_api(
"actions/workflows/issue-automation.yml/runs?per_page=20",
headers,
owner,
repo,
)
if success and response:
runs = response.get("workflow_runs", [])
if len(runs) >= expected_runs:
# Check status of recent runs
recent_runs = runs[:expected_runs]
running_count = 0
completed_count = 0
failed_count = 0
for run in recent_runs:
status = run["status"]
conclusion = run.get("conclusion")
if status == "completed":
completed_count += 1
if conclusion == "failure":
failed_count += 1
elif status in ["in_progress", "queued"]:
running_count += 1
print(
f" Status: {completed_count} completed, {running_count} running/queued"
)
# Wait until NO workflows are running and we have enough completed runs
if running_count == 0 and completed_count >= expected_runs:
if failed_count > 0:
print(
f"⚠️ Warning: {failed_count} workflow runs failed, but continuing verification..."
)
print(
f"✅ All workflows completed. Found {completed_count} completed runs."
)
# Additional wait to ensure all issue processing is done
print("⏳ Additional wait for issue processing to complete...")
time.sleep(5)
return True
elif len(runs) == 0:
# No workflow runs found
no_workflow_check_count += 1
if no_workflow_check_count == 1:
print(
" No workflow runs found yet, waiting 5 seconds and checking once more..."
)
time.sleep(5)
continue
elif no_workflow_check_count >= 2:
print(
"⚠️ No workflow runs detected after 2 checks. Workflow may not have been triggered."
)
print(" Continuing with verification...")
return False
else:
print(
f" Waiting for workflow runs... Found {len(runs)}, expected {expected_runs}"
)
print(f"⏳ Still waiting... ({int(time.time() - start_time)}s elapsed)")
time.sleep(5)
except Exception as e:
print(f"⚠️ Error checking workflow status: {e}")
time.sleep(5)
print(f"⚠️ Workflow completion wait timed out after {max_wait}s")
return False
def _find_issue_by_title(
title: str, headers: Dict[str, str], owner: str, repo: str
) -> Optional[Dict]:
"""Find an issue by exact title match."""
success, issues = _search_github_issues(
f'repo:{owner}/{repo} "{title}" is:issue', headers
)
if success and issues:
for issue in issues:
if issue.get("title") == title:
return issue
return None
def _check_issue_labels(
issue: Dict, expected_labels: List[str]
) -> Tuple[bool, List[str]]:
"""Check if issue has the expected labels."""
actual_labels = [label["name"] for label in issue.get("labels", [])]
missing_labels = [label for label in expected_labels if label not in actual_labels]
if missing_labels:
return False, [f"Missing labels: {missing_labels}. Found: {actual_labels}"]
return True, []
def _check_issue_milestone(
    issue: Dict, expected_milestone: Optional[str]
) -> Tuple[bool, List[str]]:
"""Check if issue has the expected milestone."""
milestone = issue.get("milestone")
if not milestone:
if expected_milestone:
return False, [f"No milestone found. Expected: {expected_milestone}"]
return True, []
if milestone.get("title") != expected_milestone:
return False, [
f"Wrong milestone: {milestone.get('title')}. Expected: {expected_milestone}"
]
return True, []
def _check_issue_comments(
issue_number: int,
expected_content: str,
headers: Dict[str, str],
owner: str,
repo: str,
) -> Tuple[bool, List[str]]:
"""Check if issue has a comment containing expected content."""
success, comments = _get_github_api(
f"issues/{issue_number}/comments", headers, owner, repo
)
if not success:
return False, ["Failed to get issue comments"]
if not comments:
return False, [f"No comments found. Expected comment with: {expected_content}"]
for comment in comments:
if expected_content in comment.get("body", ""):
return True, []
return False, [f"Expected content '{expected_content}' not found in comments"]
def _find_epic_sub_issues(
parent_issue_number: int, headers: Dict[str, str], owner: str, repo: str
) -> Tuple[List[Dict], List[str]]:
"""Find sub-issues created for an epic."""
# Search for each expected sub-task by exact title
expected_subtasks = [
"[SUBTASK] Epic: Redesign user dashboard interface - Task 1: Requirements Analysis",
"[SUBTASK] Epic: Redesign user dashboard interface - Task 2: Design and Architecture",
"[SUBTASK] Epic: Redesign user dashboard interface - Task 3: Implementation",
"[SUBTASK] Epic: Redesign user dashboard interface - Task 4: Testing and Documentation",
]
subtasks = []
errors = []
for expected_title in expected_subtasks:
# Search for exact title
success, issues = _search_github_issues(
f'repo:{owner}/{repo} "{expected_title}" is:issue', headers
)
if not success:
errors.append(f"Failed to search for sub-issue: {expected_title}")
continue
# Find exact match
found = False
for issue in issues:
if issue.get("title") == expected_title:
# Verify it references the parent issue
body = issue.get("body", "")
if (
f"#{parent_issue_number}" in body
or f"Related to #{parent_issue_number}" in body
):
subtasks.append(issue)
found = True
break
if not found:
errors.append(
f"Sub-issue not found or doesn't reference parent: {expected_title}"
)
return subtasks, errors
def _check_epic_checklist(
issue: Dict, subtask_numbers: List[int]
) -> Tuple[bool, List[str]]:
"""Check if epic issue has the Epic Tasks checklist with correct issue references."""
body = issue.get("body", "")
errors = []
if "## Epic Tasks" not in body:
return False, ["Epic Tasks section not found in issue body"]
# Check that all subtask issue numbers are referenced in checkbox format
for number in subtask_numbers:
# Check for checkbox format: - [ ] #number
if f"- [ ] #{number}" not in body:
errors.append(
f"Sub-issue #{number} not found in Epic Tasks checklist format (expected: '- [ ] #{number}')"
)
# Also verify the expected task names are present
expected_tasks = [
"Requirements Analysis",
"Design and Architecture",
"Implementation",
"Testing and Documentation",
]
for task in expected_tasks:
if task not in body:
errors.append(f"Task name '{task}' not found in Epic Tasks section")
if errors:
return False, errors
return True, []
def _verify_bug_issue(
headers: Dict[str, str], owner: str, repo: str
) -> Tuple[bool, List[str]]:
"""Verify the bug issue requirements."""
print("\n🐛 Verifying Bug Issue...")
errors = []
# Find bug issue
bug_issue = _find_issue_by_title(
"Bug: Login form validation not working", headers, owner, repo
)
if not bug_issue:
return False, ["Bug issue 'Bug: Login form validation not working' not found"]
issue_number = bug_issue["number"]
print(f" Found bug issue #{issue_number}")
# Check labels (including first-time-contributor since it's the first issue)
expected_labels = ["bug", "priority-high", "needs-review", "first-time-contributor"]
labels_ok, label_errors = _check_issue_labels(bug_issue, expected_labels)
if not labels_ok:
errors.extend(label_errors)
else:
print(f" ✅ Labels verified: {expected_labels}")
# Check milestone
milestone_ok, milestone_errors = _check_issue_milestone(bug_issue, "v1.0.0")
if not milestone_ok:
errors.extend(milestone_errors)
else:
print(" ✅ Milestone verified: v1.0.0")
# Check comment
comment_ok, comment_errors = _check_issue_comments(
issue_number, "Bug Report Guidelines", headers, owner, repo
)
if not comment_ok:
errors.extend(comment_errors)
else:
print(" ✅ Bug Report Guidelines comment found")
return len(errors) == 0, errors
def _verify_epic_issue(
headers: Dict[str, str], owner: str, repo: str
) -> Tuple[bool, List[str]]:
"""Verify the epic issue requirements."""
print("\n🚀 Verifying Epic Issue...")
errors = []
# Find epic issue
epic_issue = _find_issue_by_title(
"Epic: Redesign user dashboard interface", headers, owner, repo
)
if not epic_issue:
return False, ["Epic issue 'Epic: Redesign user dashboard interface' not found"]
issue_number = epic_issue["number"]
print(f" Found epic issue #{issue_number}")
# Check labels
expected_labels = ["epic", "priority-high", "needs-review"]
labels_ok, label_errors = _check_issue_labels(epic_issue, expected_labels)
if not labels_ok:
errors.extend(label_errors)
else:
print(f" ✅ Labels verified: {expected_labels}")
# Check milestone
milestone_ok, milestone_errors = _check_issue_milestone(epic_issue, "v1.0.0")
if not milestone_ok:
errors.extend(milestone_errors)
else:
print(" ✅ Milestone verified: v1.0.0")
# Check comment
comment_ok, comment_errors = _check_issue_comments(
issue_number, "Feature Request Process", headers, owner, repo
)
if not comment_ok:
errors.extend(comment_errors)
else:
print(" ✅ Feature Request Process comment found")
# Find and verify sub-issues
sub_issues, sub_errors = _find_epic_sub_issues(issue_number, headers, owner, repo)
if sub_errors:
errors.extend(sub_errors)
elif len(sub_issues) != 4:
errors.append(f"Expected 4 sub-issues, found {len(sub_issues)}")
else:
print(f" ✅ Found {len(sub_issues)} sub-issues")
# Collect sub-issue numbers for checklist verification
subtask_numbers = []
# Verify each sub-issue has correct labels and link to parent
for sub_issue in sub_issues:
sub_number = sub_issue["number"]
subtask_numbers.append(sub_number)
# Check labels
sub_labels = [label["name"] for label in sub_issue.get("labels", [])]
expected_sub_labels = ["enhancement", "needs-review"]
missing_sub_labels = [
label for label in expected_sub_labels if label not in sub_labels
]
if missing_sub_labels:
errors.append(
f"Sub-issue #{sub_number} missing labels: {missing_sub_labels}"
)
# Verify parent reference in body
sub_body = sub_issue.get("body", "")
if (
f"#{issue_number}" not in sub_body
and f"Related to #{issue_number}" not in sub_body
):
errors.append(
f"Sub-issue #{sub_number} doesn't reference parent issue #{issue_number}"
)
if not errors:
print(
" ✅ All 4 sub-tasks created with correct labels and parent references"
)
# Check Epic Tasks checklist with correct issue numbers
checklist_ok, checklist_errors = _check_epic_checklist(
epic_issue, subtask_numbers
)
if not checklist_ok:
errors.extend(checklist_errors)
else:
print(
f" ✅ Epic Tasks checklist verified with correct issue references: {subtask_numbers}"
)
return len(errors) == 0, errors
def _verify_maintenance_issue(
headers: Dict[str, str], owner: str, repo: str
) -> Tuple[bool, List[str]]:
"""Verify the maintenance issue requirements."""
print("\n🔧 Verifying Maintenance Issue...")
errors = []
# Find maintenance issue
maintenance_issue = _find_issue_by_title(
"Weekly maintenance cleanup and refactor", headers, owner, repo
)
if not maintenance_issue:
return False, [
"Maintenance issue 'Weekly maintenance cleanup and refactor' not found"
]
issue_number = maintenance_issue["number"]
print(f" Found maintenance issue #{issue_number}")
# Check labels
expected_labels = ["maintenance", "priority-medium", "needs-review"]
labels_ok, label_errors = _check_issue_labels(maintenance_issue, expected_labels)
if not labels_ok:
errors.extend(label_errors)
else:
print(f" ✅ Labels verified: {expected_labels}")
# Check NO milestone (maintenance issues shouldn't get v1.0.0)
milestone_ok, milestone_errors = _check_issue_milestone(maintenance_issue, None)
if not milestone_ok:
errors.extend(milestone_errors)
else:
print(" ✅ No milestone assigned (correct for maintenance issue)")
# Check comment
comment_ok, comment_errors = _check_issue_comments(
issue_number, "Maintenance Guidelines", headers, owner, repo
)
if not comment_ok:
errors.extend(comment_errors)
else:
print(" ✅ Maintenance Guidelines comment found")
return len(errors) == 0, errors
def verify() -> bool:
"""
Verify that the issue management workflow automation is working correctly.
"""
# Load environment variables
load_dotenv(".mcp_env")
github_token = os.environ.get("MCP_GITHUB_TOKEN")
if not github_token:
print("Error: MCP_GITHUB_TOKEN environment variable not set", file=sys.stderr)
return False
# Get GitHub organization
github_org = os.environ.get("GITHUB_EVAL_ORG")
if not github_org:
print("Error: GITHUB_EVAL_ORG environment variable not set", file=sys.stderr)
return False
# Repository configuration
owner = github_org
repo = "mcpmark-cicd"
headers = {
"Authorization": f"token {github_token}",
"Accept": "application/vnd.github.v3+json",
}
print("🔍 Starting Issue Management Workflow Verification")
print("=" * 60)
# Wait for workflows to complete
workflows_completed = _wait_for_workflow_completion(headers, owner, repo)
if not workflows_completed:
print(
"⚠️ Warning: Workflows may still be running. Continuing with verification..."
)
# Verify each test issue
all_passed = True
# 1. Verify bug issue
bug_ok, bug_errors = _verify_bug_issue(headers, owner, repo)
if not bug_ok:
all_passed = False
print("❌ Bug Issue Verification Failed:")
for error in bug_errors:
print(f" - {error}")
else:
print("✅ Bug Issue Verification Passed")
# 2. Verify epic issue
epic_ok, epic_errors = _verify_epic_issue(headers, owner, repo)
if not epic_ok:
all_passed = False
print("❌ Epic Issue Verification Failed:")
for error in epic_errors:
print(f" - {error}")
else:
print("✅ Epic Issue Verification Passed")
# 3. Verify maintenance issue
maintenance_ok, maintenance_errors = _verify_maintenance_issue(headers, owner, repo)
if not maintenance_ok:
all_passed = False
print("❌ Maintenance Issue Verification Failed:")
for error in maintenance_errors:
print(f" - {error}")
else:
print("✅ Maintenance Issue Verification Passed")
print("\n" + "=" * 60)
if all_passed:
print("🎉 All Issue Management Workflow verifications PASSED!")
print("\n📋 Summary:")
print(
" ✅ Bug issue: labels (including first-time-contributor), milestone, and auto-response verified"
)
print(
" ✅ Epic issue: labels, milestone, 4 sub-issues with checklist, and correct issue references verified"
)
print(
" ✅ Maintenance issue: labels, no milestone, and auto-response verified"
)
print("\n🤖 The GitHub Actions workflow automation is working correctly!")
else:
print("❌ Issue Management Workflow verification FAILED!")
print(" Some issues did not meet the expected automation requirements.")
return all_passed
if __name__ == "__main__":
success = verify()
sys.exit(0 if success else 1)
================================================
FILE: tasks/github/standard/mcpmark-cicd/linting_ci_workflow/description.md
================================================
I need you to set up a proper linting workflow for our CI pipeline to ensure code quality standards are enforced on all pull requests. Here's what you need to do:
**Step 1: Create Linting Configuration Branch**
Create a new branch called 'ci/add-eslint-workflow' from the main branch.
**Step 2: Create ESLint Configuration**
On the new branch, create the file `.eslintrc.json` in the repository root with:
```json
{
"env": {
"browser": true,
"es2021": true,
"node": true
},
"extends": [
"eslint:recommended"
],
"parserOptions": {
"ecmaVersion": 12,
"sourceType": "module"
},
"rules": {
"no-unused-vars": "error",
"no-console": "warn",
"semi": ["error", "always"],
"quotes": ["error", "single"]
},
"ignorePatterns": ["node_modules/", "dist/", "build/"]
}
```
**Step 3: Create GitHub Actions Linting Workflow**
Create the file `.github/workflows/lint.yml` with:
- Workflow name: "Code Linting"
- Triggers on: push to main, pull_request events
- Uses ubuntu-latest runner
- Sets up Node.js version 18 using actions/setup-node
- Installs dependencies with npm ci
- Installs ESLint globally
- Runs ESLint on all JavaScript files in src/ directories
- Fails the workflow if linting errors are found
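Before opening the PR, you can sanity-check the workflow file locally. The snippet below is an illustrative sketch only (it assumes PyYAML is installed and that the file exists at the path above); the task itself is still performed through the GitHub tooling.
```python
# Illustrative local sanity check for .github/workflows/lint.yml (assumes PyYAML is installed).
import yaml

with open(".github/workflows/lint.yml") as f:
    workflow = yaml.safe_load(f)

assert workflow.get("name") == "Code Linting", "workflow must be named 'Code Linting'"

# PyYAML parses the bare `on` key as the boolean True, so accept either spelling.
triggers = workflow.get("on", workflow.get(True, {}))
assert "push" in triggers and "pull_request" in triggers, "missing push/pull_request triggers"

jobs = workflow.get("jobs", {})
assert jobs, "workflow must define at least one job"
for job in jobs.values():
    assert job.get("runs-on") == "ubuntu-latest", "jobs should use the ubuntu-latest runner"

print("lint.yml looks structurally correct")
```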
**Step 4: Create a File That Will Fail Linting**
Create the file `src/example.js` with intentional linting violations that will cause the CI check to fail.
**Step 5: Create Pull Request**
Commit all the changes (ESLint config, workflow file, and example file with linting errors) in a single commit, then create a pull request from 'ci/add-eslint-workflow' to 'main' with:
- Title: "Add ESLint workflow for code quality enforcement"
- Body must include:
- A "## Summary" heading describing the linting setup
- A "## Changes" heading listing the files added
- A "## Testing" heading explaining how to test the workflow
- Mention that the PR intentionally includes linting errors to demonstrate the workflow
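In this benchmark the PR is normally opened through the GitHub MCP tools, but for reference the equivalent REST call looks roughly like the sketch below (the org placeholder and the body text are assumptions; only the title, branch names, and required headings come from the steps above).
```python
# Illustrative sketch of Step 5 via the GitHub REST API (placeholders are assumptions).
import os
import requests

owner, repo = "<your-org>", "mcpmark-cicd"
headers = {
    "Authorization": f"token {os.environ['MCP_GITHUB_TOKEN']}",
    "Accept": "application/vnd.github.v3+json",
}
body = (
    "## Summary\nAdds an ESLint configuration and a Code Linting workflow.\n\n"
    "## Changes\n- .eslintrc.json\n- .github/workflows/lint.yml\n- src/example.js\n\n"
    "## Testing\nOpen the PR and watch the Code Linting check run on the changed files.\n\n"
    "Note: this PR intentionally includes linting errors to demonstrate the workflow."
)
resp = requests.post(
    f"https://api.github.com/repos/{owner}/{repo}/pulls",
    headers=headers,
    json={
        "title": "Add ESLint workflow for code quality enforcement",
        "head": "ci/add-eslint-workflow",
        "base": "main",
        "body": body,
    },
)
resp.raise_for_status()
print("Created PR", resp.json()["number"])
```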
**Step 6: Fix Linting Errors and Update PR**
Fix the linting errors in `src/example.js` and commit the changes in a single commit to update the PR so that the CI check passes.
================================================
FILE: tasks/github/standard/mcpmark-cicd/linting_ci_workflow/meta.json
================================================
{
"task_id": "linting_ci_workflow",
"task_name": "Linting Ci Workflow",
"category_id": "mcpmark-cicd",
"category_name": "MCPMark CI/CD",
"description": "Set up ESLint workflow for code quality enforcement on all pull requests with proper CI integration.",
"author": "Zijian Wu",
"created_at": "2025-08-15",
"difficulty": "L3",
"tags": [
"ci/cd automation",
"pr workflows"
],
"mcp": [
"github"
],
"meta_data": {
"stateType": "url",
"stateContent": null,
"stateUrl": "https://github.com/mcpmark-source/mcpmark-cicd",
"stateOriginalUrl": null
}
}
================================================
FILE: tasks/github/standard/mcpmark-cicd/linting_ci_workflow/verify.py
================================================
import sys
import os
import requests
from typing import Dict, List, Optional, Tuple
import base64
from dotenv import load_dotenv
import time
import json
def _get_github_api(
    endpoint: str, headers: Dict[str, str], org: str, repo: str = "mcpmark-cicd"
) -> Tuple[bool, Optional[Dict]]:
"""Make a GET request to GitHub API and return (success, response)."""
url = f"https://api.github.com/repos/{org}/{repo}/{endpoint}"
try:
response = requests.get(url, headers=headers)
if response.status_code == 200:
return True, response.json()
elif response.status_code == 404:
return False, None
else:
print(f"API error for {endpoint}: {response.status_code}", file=sys.stderr)
return False, None
except Exception as e:
print(f"Exception for {endpoint}: {e}", file=sys.stderr)
return False, None
def _check_branch_exists(
branch_name: str, headers: Dict[str, str], org: str, repo: str = "mcpmark-cicd"
) -> bool:
"""Verify that a branch exists in the repository."""
success, _ = _get_github_api(f"branches/{branch_name}", headers, org, repo)
return success
def _get_file_content(
file_path: str,
headers: Dict[str, str],
org: str,
repo: str = "claude-code",
ref: str = "main",
) -> Optional[str]:
"""Get the content of a file from the repository."""
success, result = _get_github_api(
f"contents/{file_path}?ref={ref}", headers, org, repo
)
if not success or not result:
return None
try:
content = base64.b64decode(result.get("content", "")).decode("utf-8")
return content
except Exception as e:
print(f"Content decode error for {file_path}: {e}", file=sys.stderr)
return None
def _find_pr_by_title_keyword(
keyword: str, headers: Dict[str, str], org: str, repo: str = "mcpmark-cicd"
) -> Optional[Dict]:
"""Find a PR by title keyword and return the PR data."""
for state in ["open", "closed"]:
success, prs = _get_github_api(
f"pulls?state={state}&per_page=100", headers, org, repo
)
if success and prs:
for pr in prs:
if keyword.lower() in pr.get("title", "").lower():
return pr
return None
def _get_workflow_runs_for_pr(
pr_number: int, headers: Dict[str, str], org: str, repo: str = "mcpmark-cicd"
) -> List[Dict]:
"""Get workflow runs for a specific PR."""
success, runs = _get_github_api(
"actions/runs?event=pull_request&per_page=100", headers, org, repo
)
if not success or not runs:
return []
pr_runs = []
for run in runs.get("workflow_runs", []):
# Check if this run is associated with our PR
for pr in run.get("pull_requests", []):
if pr.get("number") == pr_number:
pr_runs.append(run)
break
return pr_runs
def _get_pr_commits(
pr_number: int, headers: Dict[str, str], org: str, repo: str = "mcpmark-cicd"
) -> List[Dict]:
"""Get commits for a specific PR."""
success, commits = _get_github_api(f"pulls/{pr_number}/commits", headers, org, repo)
if not success or not commits:
return []
return commits
def _get_workflow_runs_for_commit(
commit_sha: str, headers: Dict[str, str], org: str, repo: str = "mcpmark-cicd"
) -> List[Dict]:
"""Get workflow runs for a specific commit."""
success, runs = _get_github_api(
f"actions/runs?head_sha={commit_sha}&per_page=100", headers, org, repo
)
if not success or not runs:
return []
return runs.get("workflow_runs", [])
def verify() -> bool:
"""
Programmatically verify that the ESLint CI workflow setup
meets the requirements described in description.md.
"""
# Configuration constants
BRANCH_NAME = "ci/add-eslint-workflow"
PR_KEYWORD = "eslint workflow"
# Expected files and their content checks
ESLINT_CONFIG_PATH = ".eslintrc.json"
WORKFLOW_PATH = ".github/workflows/lint.yml"
EXAMPLE_FILE_PATH = "src/example.js"
# Expected workflow content keywords
WORKFLOW_KEYWORDS = [
"Code Linting",
"ubuntu-latest",
"actions/setup-node",
"npm ci",
"eslint",
"src/",
]
# Load environment variables from .mcp_env
load_dotenv(".mcp_env")
# Get GitHub token and org
github_token = os.environ.get("MCP_GITHUB_TOKEN")
github_org = os.environ.get("GITHUB_EVAL_ORG")
if not github_token:
print("Error: MCP_GITHUB_TOKEN environment variable not set", file=sys.stderr)
return False
if not github_org:
print("Error: GITHUB_EVAL_ORG environment variable not set", file=sys.stderr)
return False
headers = {
"Authorization": f"Bearer {github_token}",
"Accept": "application/vnd.github.v3+json",
}
# Run verification checks
print("Verifying ESLint CI workflow setup...")
# 1. Check that branch exists
print("1. Verifying CI branch exists...")
if not _check_branch_exists(BRANCH_NAME, headers, github_org):
print(f"Error: Branch '{BRANCH_NAME}' not found", file=sys.stderr)
return False
print("✓ CI branch created")
# 2. Check ESLint configuration file
print("2. Verifying .eslintrc.json...")
eslint_content = _get_file_content(
ESLINT_CONFIG_PATH, headers, github_org, "mcpmark-cicd", BRANCH_NAME
)
if not eslint_content:
print("Error: .eslintrc.json not found", file=sys.stderr)
return False
# Validate ESLint config is valid JSON and contains required rules
try:
eslint_config = json.loads(eslint_content)
rules = eslint_config.get("rules", {})
required_rules = ["no-unused-vars", "semi", "quotes"]
missing_rules = [rule for rule in required_rules if rule not in rules]
if missing_rules:
print(
f"Error: .eslintrc.json missing rules: {missing_rules}", file=sys.stderr
)
return False
except json.JSONDecodeError:
print("Error: .eslintrc.json is not valid JSON", file=sys.stderr)
return False
print("✓ ESLint configuration created with proper rules")
# 3. Check GitHub Actions workflow file
print("3. Verifying .github/workflows/lint.yml...")
workflow_content = _get_file_content(
WORKFLOW_PATH, headers, github_org, "mcpmark-cicd", BRANCH_NAME
)
if not workflow_content:
print("Error: .github/workflows/lint.yml not found", file=sys.stderr)
return False
# Check workflow contains required keywords
missing_keywords = [kw for kw in WORKFLOW_KEYWORDS if kw not in workflow_content]
if missing_keywords:
print(f"Error: Workflow missing keywords: {missing_keywords}", file=sys.stderr)
return False
# Check trigger configuration
if "pull_request" not in workflow_content or "push" not in workflow_content:
print("Error: Workflow missing proper triggers", file=sys.stderr)
return False
print("✓ GitHub Actions workflow created with proper configuration")
# 4. Check example file with linting errors initially exists
print("4. Verifying src/example.js...")
example_content = _get_file_content(
EXAMPLE_FILE_PATH, headers, github_org, "mcpmark-cicd", BRANCH_NAME
)
if not example_content:
print("Error: src/example.js not found", file=sys.stderr)
return False
print("✓ Example file created")
# 5. Find and verify the linting PR
print("5. Verifying linting pull request...")
lint_pr = _find_pr_by_title_keyword(PR_KEYWORD, headers, github_org)
if not lint_pr:
# Try alternative keywords
lint_pr = _find_pr_by_title_keyword("eslint", headers, github_org)
if not lint_pr:
print("Error: Linting PR not found", file=sys.stderr)
return False
pr_body = lint_pr.get("body", "")
pr_number = lint_pr.get("number")
# Check PR body sections
required_sections = ["## Summary", "## Changes", "## Testing"]
missing_sections = [
section for section in required_sections if section not in pr_body
]
if missing_sections:
print(
f"Error: Linting PR missing sections: {missing_sections}", file=sys.stderr
)
return False
print("✓ Linting PR created with proper structure")
# 6. Check workflow runs and status changes
print("6. Verifying workflow execution and status...")
# First get the commits for this PR
commits = _get_pr_commits(pr_number, headers, github_org)
if len(commits) != 2:
print(
f"Error: Expected exactly 2 commits, found {len(commits)}", file=sys.stderr
)
return False
print("✓ Found exactly 2 commits as expected")
# Sort commits chronologically (oldest first)
commits.sort(key=lambda x: x.get("commit", {}).get("author", {}).get("date", ""))
first_commit_sha = commits[0].get("sha")
second_commit_sha = commits[1].get("sha")
print(f"First commit (should fail): {first_commit_sha[:7]}")
print(f"Second commit (should pass): {second_commit_sha[:7]}")
# Wait for workflows on both commits to complete
print("Waiting for workflow completion on first commit...")
first_commit_runs = []
second_commit_runs = []
start_time = time.time()
timeout = 90
no_workflow_check_count = 0
while time.time() - start_time < timeout:
first_commit_runs = _get_workflow_runs_for_commit(
first_commit_sha, headers, github_org
)
second_commit_runs = _get_workflow_runs_for_commit(
second_commit_sha, headers, github_org
)
# Check if any workflows exist
if not first_commit_runs and not second_commit_runs:
no_workflow_check_count += 1
if no_workflow_check_count == 1:
print(
"No workflow runs found yet, waiting 5 seconds and checking once more..."
)
time.sleep(5)
continue
elif no_workflow_check_count >= 2:
print(
"⚠️ No workflow runs detected after 2 checks. Workflows may not have been triggered."
)
print(" Continuing with verification...")
break
# Check if workflows are completed
first_completed = any(
run.get("status") == "completed" for run in first_commit_runs
)
second_completed = any(
run.get("status") == "completed" for run in second_commit_runs
)
if first_completed and second_completed:
break
print("Waiting for workflows to complete...")
time.sleep(10)
# Verify first commit workflow failed
first_commit_status = None
for run in first_commit_runs:
if run.get("status") == "completed":
conclusion = run.get("conclusion")
if conclusion in ["failure", "cancelled"]:
first_commit_status = "failed"
print("✓ First commit workflow failed as expected")
break
elif conclusion == "success":
first_commit_status = "passed"
break
if first_commit_status != "failed":
print(
"Error: First commit workflow should have failed due to linting errors",
file=sys.stderr,
)
return False
# Verify second commit workflow succeeded
second_commit_status = None
for run in second_commit_runs:
if run.get("status") == "completed":
conclusion = run.get("conclusion")
if conclusion == "success":
second_commit_status = "passed"
print("✓ Second commit workflow passed as expected")
break
elif conclusion in ["failure", "cancelled"]:
second_commit_status = "failed"
break
if second_commit_status != "passed":
print(
"Error: Second commit workflow should have passed after fixing linting errors",
file=sys.stderr,
)
return False
print(
"✓ Workflow status sequence verified: first commit failed → second commit passed"
)
# 7. Verify the final state shows clean code
print("7. Verifying final file state...")
final_example_content = _get_file_content(
EXAMPLE_FILE_PATH, headers, github_org, "mcpmark-cicd", BRANCH_NAME
)
if final_example_content:
# Check that obvious linting errors are fixed
if (
"unusedVariable" in final_example_content
or 'console.log("Hello World")' in final_example_content
):
print(
"Warning: Example file may still contain linting errors",
file=sys.stderr,
)
else:
print("✓ Linting errors appear to be fixed")
print("\n✅ All verification checks passed!")
print("ESLint CI workflow setup completed successfully:")
print(f" - Linting PR #{pr_number}")
print(f" - Branch: {BRANCH_NAME}")
print(
" - Files created: .eslintrc.json, .github/workflows/lint.yml, src/example.js"
)
print(" - Workflow configured for pull_request and push triggers")
print(
f" - Total workflow runs found: {len(first_commit_runs) + len(second_commit_runs)}"
)
print(
f" - First commit runs: {len(first_commit_runs)}, Second commit runs: {len(second_commit_runs)}"
)
return True
if __name__ == "__main__":
success = verify()
sys.exit(0 if success else 1)
================================================
FILE: tasks/github/standard/mcpmark-cicd/pr_automation_workflow/description.md
================================================
I need you to create a comprehensive Pull Request automation workflow for this Node.js project. The project currently has no GitHub Actions workflows, so you'll be building a PR-focused CI/CD workflow from scratch that responds to pull request events. Here's what needs to be implemented:
## Pull Request Automation Workflow
Create `.github/workflows/pr-automation.yml` that triggers on `pull_request` events (opened, synchronize, reopened) with these jobs:
### 1. **code-quality** job (name: `code-quality`):
- Runs ESLint checks using `npm run lint`
- Runs Prettier formatting checks
- Posts code quality results as PR comment (must include keywords: "Code Quality Report", "ESLint", "Prettier")
### 2. **testing-suite** job (name: `testing-suite`):
- Runs full test suite with `npm test`
- Generates test coverage report
- Posts coverage summary as PR comment (must include keywords: "Test Coverage Report")
- Uploads coverage artifacts
### 3. **security-scan** job (name: `security-scan`):
- Runs dependency vulnerability checks
- Scans for secrets in code changes
- Creates security report as PR comment (must include keywords: "Security Scan Report", "Vulnerabilities", "Dependencies")
### 4. **build-validation** job (name: `build-validation`):
- Attempts to build the application
- Validates all endpoints are accessible
- Creates deployment preview artifacts
- Posts build status as PR comment (must include keywords: "Build Validation")
**IMPORTANT: All four jobs must run in parallel.**
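As a rough way to confirm parallelism after the fact, you can compare each job's `started_at` timestamp through the Actions jobs API. The sketch below is illustrative only; the run id and org placeholder are assumptions.
```python
# Sketch: check that the jobs of a given run started close together (run_id and org are placeholders).
import os
from datetime import datetime
import requests

owner, repo, run_id = "<your-org>", "mcpmark-cicd", 123456789
headers = {
    "Authorization": f"token {os.environ['MCP_GITHUB_TOKEN']}",
    "Accept": "application/vnd.github.v3+json",
}
resp = requests.get(
    f"https://api.github.com/repos/{owner}/{repo}/actions/runs/{run_id}/jobs",
    headers=headers,
)
resp.raise_for_status()
starts = [
    datetime.fromisoformat(job["started_at"].replace("Z", "+00:00"))
    for job in resp.json()["jobs"]
    if job.get("started_at")
]
span = (max(starts) - min(starts)).total_seconds()
print(f"{len(starts)} jobs started within {span:.0f} seconds of each other")
```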
## Implementation Requirements:
**Step 1: Create Feature Branch**
Create a new branch called `pr-automation-workflow` from main.
**Step 2: Create the Workflow**
Create `.github/workflows/pr-automation.yml` with proper YAML syntax:
- Appropriate triggers for pull_request events
- All four jobs configured to run in parallel
- Avoid identifier conflicts in github-script actions
**Step 3: Create and Merge Pull Request**
Create a comprehensive pull request and merge it to main:
- Title: "Implement Pull Request Automation Workflow"
- Detailed description of the workflow and its purpose
- Merge the pull request to main branch
## Important Notes:
- **All jobs MUST run in parallel**
- Ensure your PR satisfies ALL required checks
- The workflow should handle edge cases, have proper error recovery, and provide clear logging
================================================
FILE: tasks/github/standard/mcpmark-cicd/pr_automation_workflow/meta.json
================================================
{
"task_id": "pr_automation_workflow",
"task_name": "Pr Automation Workflow",
"category_id": "mcpmark-cicd",
"category_name": "MCPMark CI/CD",
"description": "Create comprehensive PR automation with parallel jobs for code quality, testing, security scanning, and build validation.",
"author": "Zijian Wu",
"created_at": "2025-08-15",
"difficulty": "L3",
"tags": [
"pr workflows",
"ci/cd automation",
"workflow automation"
],
"mcp": [
"github"
],
"meta_data": {
"stateType": "url",
"stateContent": null,
"stateUrl": "https://github.com/mcpmark-source/mcpmark-cicd",
"stateOriginalUrl": null
}
}
================================================
FILE: tasks/github/standard/mcpmark-cicd/pr_automation_workflow/verify.py
================================================
import sys
import os
import requests
import time
from typing import Dict, List, Optional, Tuple
from dotenv import load_dotenv
import base64
def _get_github_api(
endpoint: str, headers: Dict[str, str], owner: str, repo: str
) -> Tuple[bool, Optional[Dict]]:
"""Make a GET request to GitHub API and return (success, response)."""
url = f"https://api.github.com/repos/{owner}/{repo}/{endpoint}"
try:
response = requests.get(url, headers=headers)
if response.status_code == 200:
return True, response.json()
elif response.status_code == 404:
return False, None
else:
print(f"API error for {endpoint}: {response.status_code}", file=sys.stderr)
return False, None
except Exception as e:
print(f"Exception for {endpoint}: {e}", file=sys.stderr)
return False, None
def _post_github_api(
endpoint: str, headers: Dict[str, str], owner: str, repo: str, data: Dict
) -> Tuple[bool, Optional[Dict]]:
"""Make a POST request to GitHub API and return (success, response)."""
url = f"https://api.github.com/repos/{owner}/{repo}/{endpoint}"
try:
response = requests.post(url, headers=headers, json=data)
if response.status_code in [200, 201]:
return True, response.json()
else:
print(
f"API error for {endpoint}: {response.status_code} - {response.text}",
file=sys.stderr,
)
return False, None
except Exception as e:
print(f"Exception for {endpoint}: {e}", file=sys.stderr)
return False, None
def _patch_github_api(
endpoint: str, headers: Dict[str, str], owner: str, repo: str, data: Dict
) -> Tuple[bool, Optional[Dict]]:
"""Make a PATCH request to GitHub API and return (success, response)."""
url = f"https://api.github.com/repos/{owner}/{repo}/{endpoint}"
try:
response = requests.patch(url, headers=headers, json=data)
if response.status_code == 200:
return True, response.json()
else:
print(
f"API error for {endpoint}: {response.status_code} - {response.text}",
file=sys.stderr,
)
return False, None
except Exception as e:
print(f"Exception for {endpoint}: {e}", file=sys.stderr)
return False, None
def _get_file_content(
file_path: str,
headers: Dict[str, str],
owner: str,
repo: str,
ref: str = "main",
) -> Optional[str]:
"""Get the content of a file from the repository."""
success, result = _get_github_api(
f"contents/{file_path}?ref={ref}", headers, owner, repo
)
if not success or not result:
return None
try:
content = base64.b64decode(result.get("content", "")).decode("utf-8")
return content
except Exception as e:
print(f"Content decode error for {file_path}: {e}", file=sys.stderr)
return None
def _find_pr_by_title(
title: str, headers: Dict[str, str], owner: str, repo: str
) -> Optional[Dict]:
"""Find a PR by exact title match."""
for state in ["closed", "open"]:
success, prs = _get_github_api(
f"pulls?state={state}&per_page=100", headers, owner, repo
)
if success and prs:
for pr in prs:
if pr.get("title") == title:
return pr
return None
def _wait_for_workflow_completion(
headers: Dict[str, str],
owner: str,
repo: str,
workflow_file: str,
max_wait: int = 90,
) -> bool:
"""Wait for GitHub Actions workflows to complete processing."""
print(f"⏳ Waiting for {workflow_file} workflows to complete...")
start_time = time.time()
no_workflow_check_count = 0
while time.time() - start_time < max_wait:
try:
success, response = _get_github_api(
f"actions/workflows/{workflow_file}/runs?per_page=10",
headers,
owner,
repo,
)
if success and response:
runs = response.get("workflow_runs", [])
if len(runs) > 0:
running_count = 0
completed_count = 0
for run in runs[:5]: # Check recent runs
status = run["status"]
if status == "completed":
completed_count += 1
elif status in ["in_progress", "queued"]:
running_count += 1
print(
f" Status: {completed_count} completed, {running_count} running/queued"
)
if running_count == 0:
print(f"✅ All {workflow_file} workflows completed.")
return True
else:
# No workflow runs found
no_workflow_check_count += 1
if no_workflow_check_count == 1:
print(
" No workflow runs found yet, waiting 5 seconds and checking once more..."
)
time.sleep(5)
continue
elif no_workflow_check_count >= 2:
print(
f"⚠️ No workflow runs detected after 2 checks. {workflow_file} may not have been triggered."
)
print(" Continuing with verification...")
return False
print(f"⏳ Still waiting... ({int(time.time() - start_time)}s elapsed)")
time.sleep(10)
except Exception as e:
print(f"⚠️ Error checking workflow status: {e}")
time.sleep(10)
print(f"⚠️ Workflow completion wait timed out after {max_wait}s")
return False
def _verify_workflow_file(
headers: Dict[str, str], owner: str, repo: str
) -> Tuple[bool, List[str]]:
"""Verify that the workflow file exists and has correct content."""
print("\n📄 Verifying workflow file...")
errors = []
workflow_content = _get_file_content(
".github/workflows/pr-automation.yml", headers, owner, repo
)
if not workflow_content:
return False, [
"Workflow file .github/workflows/pr-automation.yml not found in main branch"
]
print(" ✅ Workflow file exists in main branch")
# Verify required components
required_events = ["opened", "synchronize", "reopened"]
required_jobs = [
"code-quality",
"testing-suite",
"security-scan",
"build-validation",
]
if "pull_request:" not in workflow_content:
errors.append("Workflow missing pull_request trigger")
else:
print(" ✅ Pull request trigger found")
for event in required_events:
if event not in workflow_content:
errors.append(f"Missing event trigger: {event}")
if not errors:
print(f" ✅ Required events found: {required_events}")
for job in required_jobs:
if f"{job}:" not in workflow_content:
errors.append(f"Missing job: {job}")
if not errors:
print(f" ✅ All 4 required jobs found: {required_jobs}")
return len(errors) == 0, errors
def _verify_main_pr_merged(
headers: Dict[str, str], owner: str, repo: str
) -> Tuple[bool, List[str], Optional[Dict]]:
"""Verify that the main PR implementing the workflow was merged."""
print("\n🔍 Verifying main PR was merged...")
errors = []
pr = _find_pr_by_title(
"Implement Pull Request Automation Workflow", headers, owner, repo
)
if not pr:
return (
False,
["Main PR 'Implement Pull Request Automation Workflow' not found"],
None,
)
pr_number = pr["number"]
print(f" Found PR #{pr_number}")
if not pr.get("merged_at", False):
errors.append(f"PR #{pr_number} was not merged")
else:
print(f" ✅ PR #{pr_number} was merged")
if pr.get("head", {}).get("ref") != "pr-automation-workflow":
errors.append(f"PR #{pr_number} was not from pr-automation-workflow branch")
else:
print(" ✅ PR was from pr-automation-workflow branch")
if pr.get("base", {}).get("ref") != "main":
errors.append(f"PR #{pr_number} was not merged to main branch")
else:
print(" ✅ PR was merged to main branch")
return len(errors) == 0, errors, pr
def _verify_workflow_runs(
pr_data: Dict, headers: Dict[str, str], owner: str, repo: str
) -> Tuple[bool, List[str]]:
"""Verify that workflow runs occurred for the PR and all 4 jobs ran in parallel."""
print("\n⚙️ Verifying workflow runs...")
errors = []
pr_number = pr_data["number"]
# Get workflow runs for the PR
success, runs_response = _get_github_api(
"actions/runs?event=pull_request&per_page=50", headers, owner, repo
)
if not success:
return False, ["Failed to fetch workflow runs"]
pr_runs = []
pr_head_sha = pr_data.get("head", {}).get("sha")
for run in runs_response.get("workflow_runs", []):
# Method 1: Check if this run is associated with the PR's head SHA
if pr_head_sha and run.get("head_sha") == pr_head_sha:
pr_runs.append(run)
continue
# Method 2: Check pull_requests field (may be empty for merged PRs)
for pr in run.get("pull_requests", []):
if pr.get("number") == pr_number:
pr_runs.append(run)
break
if not pr_runs:
# Try alternative approach: get runs by head branch
pr_head_ref = pr_data.get("head", {}).get("ref")
if pr_head_ref:
success, branch_runs = _get_github_api(
f"actions/runs?branch={pr_head_ref}&per_page=50", headers, owner, repo
)
if success:
pr_runs = branch_runs.get("workflow_runs", [])
if not pr_runs:
return False, [
f"No workflow runs found for PR #{pr_number} (head_sha: {pr_head_sha})"
]
print(f" Found {len(pr_runs)} workflow run(s) for PR #{pr_number}")
# Check the most recent run
latest_run = pr_runs[0] # GitHub returns runs in descending order by creation time
run_id = latest_run["id"]
if latest_run["conclusion"] != "success":
errors.append(
f"Latest workflow run {run_id} did not succeed (conclusion: {latest_run['conclusion']})"
)
else:
print(f" ✅ Latest workflow run {run_id} succeeded")
# Get jobs for this run
success, jobs_response = _get_github_api(
f"actions/runs/{run_id}/jobs", headers, owner, repo
)
if not success:
return False, ["Failed to fetch workflow jobs"]
jobs = jobs_response.get("jobs", [])
expected_jobs = [
"code-quality",
"testing-suite",
"security-scan",
"build-validation",
]
found_jobs = [job["name"] for job in jobs]
missing_jobs = [job for job in expected_jobs if job not in found_jobs]
if missing_jobs:
errors.append(f"Missing jobs: {missing_jobs}. Found: {found_jobs}")
else:
print(f" ✅ All 4 required jobs found: {found_jobs}")
# Verify all jobs succeeded
failed_jobs = [job["name"] for job in jobs if job["conclusion"] != "success"]
if failed_jobs:
errors.append(f"Failed jobs: {failed_jobs}")
else:
print(" ✅ All jobs completed successfully")
# Verify jobs ran in parallel (started around the same time)
if len(jobs) >= 4:
start_times = [job["started_at"] for job in jobs if job["started_at"]]
if len(start_times) >= 4:
# Check if all jobs started within 2 minutes of each other
import datetime
start_dt = [
datetime.datetime.fromisoformat(t.replace("Z", "+00:00"))
for t in start_times
]
time_diff = max(start_dt) - min(start_dt)
if time_diff.total_seconds() > 120: # 2 minutes
errors.append(
f"Jobs did not run in parallel (time span: {time_diff.total_seconds()}s)"
)
else:
print(" ✅ Jobs ran in parallel")
else:
errors.append("Not enough job start times to verify parallel execution")
return len(errors) == 0, errors
def _verify_pr_comments(
pr_data: Dict, headers: Dict[str, str], owner: str, repo: str
) -> Tuple[bool, List[str]]:
"""Verify that PR has required automation comments from GitHub Actions bot."""
print("\n💬 Verifying PR comments...")
errors = []
pr_number = pr_data["number"]
success, comments = _get_github_api(
f"issues/{pr_number}/comments", headers, owner, repo
)
if not success:
return False, ["Failed to fetch PR comments"]
# Filter for GitHub Actions bot comments only
bot_comments = [
comment
for comment in comments
if comment.get("user", {}).get("login") == "github-actions[bot]"
]
if not bot_comments:
return False, ["No comments found from GitHub Actions bot"]
print(f" Found {len(bot_comments)} comment(s) from GitHub Actions bot")
# Get all bot comment bodies
bot_comment_bodies = [comment.get("body", "") for comment in bot_comments]
# Define required automation reports with their keywords
required_reports = [
{
"name": "Code Quality Report",
"main_keywords": ["Code Quality Report"],
"sub_keywords": ["ESLint", "Prettier"],
"found": False,
},
{
"name": "Test Coverage Report",
"main_keywords": ["Test Coverage Report"],
"sub_keywords": [],
"found": False,
},
{
"name": "Security Scan Report",
"main_keywords": ["Security Scan Report"],
"sub_keywords": ["Vulnerabilities", "Dependencies"],
"found": False,
},
{
"name": "Build Validation Report",
"main_keywords": ["Build Validation"],
"sub_keywords": [],
"found": False,
},
]
# Check each bot comment for the required reports
for comment_body in bot_comment_bodies:
for report in required_reports:
# Check if this comment contains any of the main keywords for this report
if any(keyword in comment_body for keyword in report["main_keywords"]):
if not report["found"]: # Only mark as found once
report["found"] = True
print(f" ✅ Found {report['name']}")
# Verify sub-keywords are present in this specific comment
for sub_keyword in report["sub_keywords"]:
if sub_keyword not in comment_body:
errors.append(
f"Missing sub-keyword '{sub_keyword}' in {report['name']}"
)
else:
print(
f" ✅ Found sub-keyword '{sub_keyword}' in {report['name']}"
)
# Check if all required reports were found
for report in required_reports:
if not report["found"]:
errors.append(f"Missing {report['name']} from GitHub Actions bot")
# Verify we have exactly 4 automation reports
found_reports = sum(1 for report in required_reports if report["found"])
if found_reports != 4:
errors.append(f"Expected 4 automation reports, but found {found_reports}")
else:
print(" ✅ All 4 required automation reports found from GitHub Actions bot")
return len(errors) == 0, errors
def _create_test_pr(
title: str,
branch: str,
content: str,
file_path: str,
headers: Dict[str, str],
owner: str,
repo: str,
) -> Optional[int]:
"""Create a test PR with specific content designed to fail a check."""
print(f" Creating test PR: {title}")
# Create branch
success, main_ref = _get_github_api("git/ref/heads/main", headers, owner, repo)
if not success:
print(" ❌ Failed to get main branch reference")
return None
main_sha = main_ref["object"]["sha"]
branch_data = {"ref": f"refs/heads/{branch}", "sha": main_sha}
success, _ = _post_github_api("git/refs", headers, owner, repo, branch_data)
if not success:
# Branch might already exist, try to delete and recreate
print(f" Branch {branch} already exists, trying to delete and recreate...")
import requests
# Force delete existing branch
delete_url = (
f"https://api.github.com/repos/{owner}/{repo}/git/refs/heads/{branch}"
)
delete_response = requests.delete(delete_url, headers=headers)
if delete_response.status_code == 204:
print(f" Successfully deleted existing branch {branch}")
# Wait a moment for deletion to complete
import time
time.sleep(2)
# Try creating again
success, _ = _post_github_api("git/refs", headers, owner, repo, branch_data)
if not success:
print(f" ❌ Failed to create branch {branch} after cleanup")
return None
else:
print(f" ✅ Successfully created branch {branch} after cleanup")
else:
print(
f" ❌ Failed to delete existing branch {branch}: {delete_response.status_code}"
)
return None
# Create or update file
file_content = base64.b64encode(content.encode()).decode()
file_data = {
"message": f"Test commit for {title}",
"content": file_content,
"branch": branch,
}
# Check if file exists in main branch first
success, file_info = _get_github_api(
f"contents/{file_path}?ref=main", headers, owner, repo
)
if success and file_info:
# File exists, need SHA for update
file_data["sha"] = file_info["sha"]
print(f" File {file_path} exists, updating with SHA")
else:
print(f" Creating new file {file_path}")
# Use PUT method for file creation/update
url = f"https://api.github.com/repos/{owner}/{repo}/contents/{file_path}"
try:
import requests
response = requests.put(url, headers=headers, json=file_data)
if response.status_code in [200, 201]:
print(f" ✅ Successfully created/updated file {file_path}")
else:
print(
f" ❌ Failed to create/update file {file_path}: {response.status_code} - {response.text}"
)
return None
except Exception as e:
print(f" ❌ Exception creating file {file_path}: {e}")
return None
# Create PR
pr_data = {
"title": title,
"head": branch,
"base": "main",
"body": f"Test PR to validate that {title.split(':')[1].strip()} check fails correctly.",
}
success, pr_response = _post_github_api("pulls", headers, owner, repo, pr_data)
if not success:
print(" ❌ Failed to create PR")
return None
pr_number = pr_response["number"]
print(f" ✅ Created test PR #{pr_number}")
return pr_number
def _close_pr(pr_number: int, headers: Dict[str, str], owner: str, repo: str) -> bool:
"""Close a PR."""
success, _ = _patch_github_api(
f"pulls/{pr_number}", headers, owner, repo, {"state": "closed"}
)
return success
def _run_unit_tests(
headers: Dict[str, str], owner: str, repo: str
) -> Tuple[bool, List[str]]:
"""Create test PRs to verify workflow correctly fails on bad code."""
print("\n🧪 Running unit tests with failing PRs...")
errors = []
created_prs = []
test_cases = [
{
"title": "Test: Code Quality Failure",
"branch": "test-code-quality-fail",
"file_path": "src/lint-fail-test.js",
"content": "// This file contains intentional ESLint violations\nvar unused_variable = 'this will trigger unused-vars rule'\nconsole.log('missing semicolon - will trigger semi rule')\nconst badly_spaced = 'too many spaces'\nif(true){console.log('missing spaces around braces')}\nfunction unusedFunction() { return 'unused'; }\neeval('alert(\"dangerous eval\")');\nwith (Math) { var x = cos(3 * PI) + sin(LN10) }\nvar a = 1; var a = 2; // redeclared variable",
"expected_failure": "code-quality",
},
{
"title": "Test: Testing Suite Failure",
"branch": "test-testing-fail",
"file_path": "tests/fail-test.test.js",
"content": "const request = require('supertest');\n\ndescribe('Intentional Test Failures', () => {\n test('This test should always fail', () => {\n expect(2 + 2).toBe(5); // Intentionally wrong\n });\n \n test('Another failing test', () => {\n expect(true).toBe(false); // Intentionally wrong\n });\n \n test('Math failure', () => {\n expect(Math.max(1, 2, 3)).toBe(1); // Intentionally wrong\n });\n});",
"expected_failure": "testing-suite",
},
{
"title": "Test: Security Scan Failure",
"branch": "test-security-fail",
"file_path": "src/security-fail-test.js",
"content": "// This file contains patterns that should trigger secret detection\nconst hardcodedPassword = 'admin123password';\nconst fakeApiKey = 'sk_test_' + 'fake123key456here789';\nconst awsLikeKey = 'AKIA' + 'FAKEKEY7EXAMPLE';\nconst dbPassword = 'password' + '=' + 'supersecret123';\nconst tokenPattern = 'token' + '=' + 'ghp_1234567890abcdef';\n\n// These patterns should trigger secret detection\nconsole.log('Password:', hardcodedPassword);\nconsole.log('API Key:', fakeApiKey);\nconsole.log('AWS Key:', awsLikeKey);\nconsole.log('DB Password:', dbPassword);\nconsole.log('Token:', tokenPattern);\n\nmodule.exports = {\n password: hardcodedPassword,\n apiKey: fakeApiKey\n};",
"expected_failure": "security-scan",
},
{
"title": "Test: Build Validation Failure",
"branch": "test-build-fail",
"file_path": "src/build-fail-test.js",
"content": "// This file will cause build/startup failures\nconst express = require('express');\nconst nonExistentModule = require('this-module-does-not-exist-anywhere');\nconst anotherMissing = require('@fake/missing-package');\n\n// This will cause runtime errors during startup\nconst app = express();\n\n// Define a route that will cause issues\napp.get('/test', (req, res) => {\n // Try to use non-existent modules\n nonExistentModule.doSomething();\n anotherMissing.initialize();\n res.send('This should never work');\n});\n\n// Override the listen method to always fail\nconst originalListen = app.listen;\napp.listen = function(port, callback) {\n console.log('Attempting to start server...');\n // This will crash during build validation\n throw new Error('Intentional build failure for testing');\n};\n\nmodule.exports = app;",
"expected_failure": "build-validation",
},
]
for test_case in test_cases:
pr_number = _create_test_pr(
test_case["title"],
test_case["branch"],
test_case["content"],
test_case["file_path"],
headers,
owner,
repo,
)
if pr_number:
created_prs.append(pr_number)
else:
errors.append(f"Failed to create test PR: {test_case['title']}")
if created_prs:
print(f" Created {len(created_prs)} test PRs, waiting for workflows...")
# Wait a bit for workflows to start
time.sleep(5)
# Wait for workflows to complete
_wait_for_workflow_completion(
headers, owner, repo, "pr-automation.yml", max_wait=90
)
# Verify each test PR failed appropriately
for i, pr_number in enumerate(created_prs):
test_case = test_cases[i]
print(
f" Checking test PR #{pr_number} ({test_case['expected_failure']} failure)..."
)
# Get workflow runs for this PR
success, runs_response = _get_github_api(
"actions/runs?event=pull_request&per_page=20", headers, owner, repo
)
if success:
pr_runs = []
for run in runs_response.get("workflow_runs", []):
# Check pull_requests field
for pr in run.get("pull_requests", []):
if pr.get("number") == pr_number:
pr_runs.append(run)
break
# If no runs found via pull_requests, try matching by branch
if not pr_runs:
branch_name = test_case["branch"]
for run in runs_response.get("workflow_runs", []):
if run.get("head_branch") == branch_name:
pr_runs.append(run)
if pr_runs:
latest_run = pr_runs[0]
if latest_run["conclusion"] != "failure":
errors.append(
f"Test PR #{pr_number} should have failed but got: {latest_run['conclusion']}"
)
else:
print(f" ✅ Test PR #{pr_number} correctly failed")
else:
errors.append(f"No workflow runs found for test PR #{pr_number}")
# Clean up test PRs and branches
print(" Cleaning up test PRs and branches...")
for i, pr_number in enumerate(created_prs):
if _close_pr(pr_number, headers, owner, repo):
print(f" ✅ Closed test PR #{pr_number}")
else:
print(f" ⚠️ Failed to close test PR #{pr_number}")
# Delete test branch
branch_name = test_cases[i]["branch"]
import requests
url = f"https://api.github.com/repos/{owner}/{repo}/git/refs/heads/{branch_name}"
response = requests.delete(url, headers=headers)
if response.status_code == 204:
print(f" ✅ Deleted test branch {branch_name}")
else:
print(f" ⚠️ Failed to delete test branch {branch_name}")
return len(errors) == 0, errors
def verify() -> bool:
"""
Verify that the PR automation workflow is working correctly.
"""
load_dotenv(".mcp_env")
github_token = os.environ.get("MCP_GITHUB_TOKEN")
if not github_token:
print("Error: MCP_GITHUB_TOKEN environment variable not set", file=sys.stderr)
return False
github_org = os.environ.get("GITHUB_EVAL_ORG")
if not github_org:
print("Error: GITHUB_EVAL_ORG environment variable not set", file=sys.stderr)
return False
owner = github_org
repo = "mcpmark-cicd"
headers = {
"Authorization": f"token {github_token}",
"Accept": "application/vnd.github.v3+json",
}
print("🔍 Starting PR Automation Workflow Verification")
print("=" * 60)
all_passed = True
# 1. Verify workflow file exists
workflow_ok, workflow_errors = _verify_workflow_file(headers, owner, repo)
if not workflow_ok:
all_passed = False
print("❌ Workflow File Verification Failed:")
for error in workflow_errors:
print(f" - {error}")
else:
print("✅ Workflow File Verification Passed")
# 2. Verify main PR was merged
pr_ok, pr_errors, pr_data = _verify_main_pr_merged(headers, owner, repo)
if not pr_ok:
all_passed = False
print("❌ Main PR Verification Failed:")
for error in pr_errors:
print(f" - {error}")
else:
print("✅ Main PR Verification Passed")
# 3. Verify workflow runs (only if PR verification passed)
if pr_ok and pr_data:
runs_ok, runs_errors = _verify_workflow_runs(pr_data, headers, owner, repo)
if not runs_ok:
all_passed = False
print("❌ Workflow Runs Verification Failed:")
for error in runs_errors:
print(f" - {error}")
else:
print("✅ Workflow Runs Verification Passed")
# 4. Verify PR comments
comments_ok, comments_errors = _verify_pr_comments(
pr_data, headers, owner, repo
)
if not comments_ok:
all_passed = False
print("❌ PR Comments Verification Failed:")
for error in comments_errors:
print(f" - {error}")
else:
print("✅ PR Comments Verification Passed")
# 5. Run unit tests with failing PRs
tests_ok, tests_errors = _run_unit_tests(headers, owner, repo)
if not tests_ok:
all_passed = False
print("❌ Unit Tests Failed:")
for error in tests_errors:
print(f" - {error}")
else:
print("✅ Unit Tests Passed")
print("\n" + "=" * 60)
if all_passed:
print("🎉 All PR Automation Workflow verifications PASSED!")
print("\n📋 Summary:")
print(" ✅ Workflow file exists with correct triggers and 4 parallel jobs")
print(" ✅ Main PR was merged from pr-automation-workflow to main")
print(" ✅ Workflow runs show all 4 jobs executed in parallel and succeeded")
print(" ✅ PR comments contain required automation reports")
print(" ✅ Unit tests confirmed workflow correctly fails on problematic code")
print("\n🤖 The GitHub Actions PR automation workflow is working correctly!")
else:
print("❌ PR Automation Workflow verification FAILED!")
print(" Some components did not meet the expected automation requirements.")
return all_passed
if __name__ == "__main__":
success = verify()
sys.exit(0 if success else 1)
================================================
FILE: tasks/github/standard/missing-semester/assign_contributor_labels/description.md
================================================
Assign an owner to each open issue and open PR by adding labels rather than direct assignees. Only contributors who appear in the past 100 commits are considered. First, collect all such contributors and identify the most frequent author among them. Then, for each open issue or PR, assign via labels according to the following rules:
• If the comments mention an author with @username, add a label in the format assigned-username.
• If multiple authors are mentioned, add labels in the same format for all of them.
• If no authors are mentioned in the comments, add a label for the most frequent contributor from the past 100 commits, using the format assigned-username.
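For reference, the contributor-frequency part can be sketched with the commits API as below (the org placeholder and token handling are assumptions; in practice the task is carried out through the GitHub MCP tools).
```python
# Sketch: count authors over the last 100 commits and pick the most frequent one.
import os
from collections import Counter
import requests

owner, repo = "<your-org>", "missing-semester"
headers = {
    "Authorization": f"token {os.environ['MCP_GITHUB_TOKEN']}",
    "Accept": "application/vnd.github.v3+json",
}
resp = requests.get(
    f"https://api.github.com/repos/{owner}/{repo}/commits",
    headers=headers,
    params={"per_page": 100},
)
resp.raise_for_status()
logins = [
    commit["author"]["login"]
    for commit in resp.json()
    if commit.get("author")  # commits without a linked GitHub account are skipped
]
counts = Counter(logins)
most_frequent = counts.most_common(1)[0][0]
print("contributors:", sorted(counts))
print("fallback label:", f"assigned-{most_frequent}")
```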
================================================
FILE: tasks/github/standard/missing-semester/assign_contributor_labels/meta.json
================================================
{
"task_id": "assign_contributor_labels",
"task_name": "Assign Contributor Labels",
"category_id": "missing-semester",
"category_name": "Missing Semester",
"description": "Assign labels to open issues and PRs based on contributors mentioned in comments or the most frequent contributor from past 100 commits, using assigned-username format.",
"author": "Zijian Wu",
"created_at": "2025-08-15",
"difficulty": "L3",
"tags": [
"issue management",
"label automation",
"contributor analysis"
],
"mcp": [
"github"
],
"meta_data": {
"stateType": "url",
"stateContent": null,
"stateUrl": "https://github.com/mcpmark-source/missing-semester",
"stateOriginalUrl": "https://github.com/missing-semester/missing-semester"
}
}
================================================
FILE: tasks/github/standard/missing-semester/assign_contributor_labels/verify.py
================================================
import sys
import os
import requests
from typing import Dict, Optional, Tuple, List
from dotenv import load_dotenv
def _get_github_api(
endpoint: str, headers: Dict[str, str], org: str, repo: str = "missing-semester"
) -> Tuple[bool, Optional[Dict]]:
"""Make a GET request to GitHub API and return (success, response)."""
url = f"https://api.github.com/repos/{org}/{repo}/{endpoint}"
try:
response = requests.get(url, headers=headers)
if response.status_code == 200:
return True, response.json()
elif response.status_code == 404:
return False, None
else:
print(f"API error for {endpoint}: {response.status_code}", file=sys.stderr)
return False, None
except Exception as e:
print(f"Exception for {endpoint}: {e}", file=sys.stderr)
return False, None
def _get_issue_labels(
issue_number: int,
headers: Dict[str, str],
org: str,
repo: str = "missing-semester"
) -> Optional[List[str]]:
"""Get labels for a specific issue/PR."""
success, result = _get_github_api(f"issues/{issue_number}", headers, org, repo)
if not success or not result:
return None
labels = result.get("labels", [])
return [label["name"] for label in labels]
def verify() -> bool:
"""
Programmatically verify that the labels were assigned correctly to issues and PRs.
"""
# Load environment variables from .mcp_env
load_dotenv(".mcp_env")
# Get GitHub token and org
github_token = os.environ.get("MCP_GITHUB_TOKEN")
github_org = os.environ.get("GITHUB_EVAL_ORG")
if not github_token:
print("Error: MCP_GITHUB_TOKEN environment variable not set", file=sys.stderr)
return False
if not github_org:
print("Error: GITHUB_EVAL_ORG environment variable not set", file=sys.stderr)
return False
headers = {
"Authorization": f"Bearer {github_token}",
"Accept": "application/vnd.github.v3+json",
}
print("Verifying contributor labels assignment task completion...")
# Expected labels configuration
expected_labels = {
# Issues
9: ["assigned-jonhoo", "assigned-anishathalye"], # Issue #9
14: ["assigned-jonhoo", "assigned-anishathalye"], # Issue #14
15: ["assigned-anishathalye"], # Issue #15
# PRs
21: ["assigned-anishathalye"], # PR #21
22: ["assigned-anishathalye"], # PR #22
23: ["assigned-anishathalye"], # PR #23
24: ["assigned-anishathalye"], # PR #24
}
all_passed = True
for item_number, expected in expected_labels.items():
item_type = "Issue" if item_number in [9, 14, 15] else "PR"
print(f"\nChecking {item_type} #{item_number}...")
labels = _get_issue_labels(item_number, headers, github_org, "missing-semester")
if labels is None:
print(f" ❌ Failed to retrieve {item_type} #{item_number}", file=sys.stderr)
all_passed = False
continue
# Sort both lists for comparison
labels_sorted = sorted(labels)
expected_sorted = sorted(expected)
if labels_sorted == expected_sorted:
print(f" ✅ {item_type} #{item_number} has correct labels: {labels_sorted}")
else:
print(f" ❌ {item_type} #{item_number} has incorrect labels", file=sys.stderr)
print(f" Expected: {expected_sorted}", file=sys.stderr)
print(f" Found: {labels_sorted}", file=sys.stderr)
all_passed = False
if all_passed:
print("\n✅ All verification checks passed!")
print("Contributor labels assignment task completed successfully:")
print(" - Issues #9 and #14 have both 'assigned-jonhoo' and 'assigned-anishathalye' labels")
print(" - Issue #15 and all 4 open PRs have 'assigned-anishathalye' label")
else:
print("\n❌ Some verification checks failed", file=sys.stderr)
return all_passed
if __name__ == "__main__":
success = verify()
sys.exit(0 if success else 1)
================================================
FILE: tasks/github/standard/missing-semester/find_legacy_name/description.md
================================================
I remember that a long time ago, *The Missing Semester of Your CS Education* had a different name and domain. There should be some related commit history. Please find the old name and domain and create an **ANSWER.md** file with them, formatted as:
[title](url)
Then push the file to the `master` branch.
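For reference, a minimal sketch of how the answer file could be pushed once the legacy name and domain have been found in the commit history (the owner, token, and the answer itself are placeholders):

```python
import base64

import requests

OWNER, REPO = "<org>", "missing-semester"          # placeholders
API = f"https://api.github.com/repos/{OWNER}/{REPO}"
HEADERS = {"Authorization": "Bearer <token>", "Accept": "application/vnd.github.v3+json"}

# Skim recent commit messages for hints about the earlier name and domain.
for c in requests.get(f"{API}/commits", headers=HEADERS, params={"per_page": 100}).json():
    print(c["sha"][:7], c["commit"]["message"].splitlines()[0])

# Once found, create ANSWER.md on master via the contents API.
answer = "[<old title>](<old url>)\n"
requests.put(
    f"{API}/contents/ANSWER.md",
    headers=HEADERS,
    json={
        "message": "Add ANSWER.md with the legacy course name",
        "content": base64.b64encode(answer.encode()).decode(),
        "branch": "master",
    },
)
```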
================================================
FILE: tasks/github/standard/missing-semester/find_legacy_name/meta.json
================================================
{
"task_id": "find_legacy_name",
"task_name": "Find Legacy Name",
"category_id": "missing-semester",
"category_name": "Missing Semester",
"description": "Find the old name and domain of The Missing Semester course from commit history and document the findings.",
"author": "Zijian Wu",
"created_at": "2025-08-15",
"difficulty": "L3",
"tags": [
"repository analysis"
],
"mcp": [
"github"
],
"meta_data": {
"stateType": "url",
"stateContent": null,
"stateUrl": "https://github.com/mcpmark-source/missing-semester",
"stateOriginalUrl": "https://github.com/missing-semester/missing-semester"
}
}
================================================
FILE: tasks/github/standard/missing-semester/find_legacy_name/verify.py
================================================
import sys
import os
import requests
import base64
from typing import Dict, Optional, Tuple
from dotenv import load_dotenv
def _get_github_api(
endpoint: str, headers: Dict[str, str], org: str, repo: str = "missing-semester"
) -> Tuple[bool, Optional[Dict]]:
"""Make a GET request to GitHub API and return (success, response)."""
url = f"https://api.github.com/repos/{org}/{repo}/{endpoint}"
try:
response = requests.get(url, headers=headers)
if response.status_code == 200:
return True, response.json()
elif response.status_code == 404:
return False, None
else:
print(f"API error for {endpoint}: {response.status_code}", file=sys.stderr)
return False, None
except Exception as e:
print(f"Exception for {endpoint}: {e}", file=sys.stderr)
return False, None
def _get_file_content(
file_path: str,
headers: Dict[str, str],
org: str,
repo: str = "missing-semester",
ref: str = "master",
) -> Optional[str]:
"""Get the content of a file from the repository."""
success, result = _get_github_api(
f"contents/{file_path}?ref={ref}", headers, org, repo
)
if not success or not result:
return None
try:
content = base64.b64decode(result.get("content", "")).decode("utf-8")
return content
except Exception as e:
print(f"Content decode error for {file_path}: {e}", file=sys.stderr)
return None
def verify() -> bool:
"""
Programmatically verify that the legacy name finding task was completed correctly.
Checks for ANSWER.md file in master branch with the correct content.
"""
# Expected answer content (accept both with and without trailing slash)
EXPECTED_CONTENTS = {
"[Hacker Tools](https://hacker-tools.github.io)",
"[Hacker Tools](https://hacker-tools.github.io/)",
}
# Load environment variables from .mcp_env
load_dotenv(".mcp_env")
# Get GitHub token and org
github_token = os.environ.get("MCP_GITHUB_TOKEN")
github_org = os.environ.get("GITHUB_EVAL_ORG")
if not github_token:
print("Error: MCP_GITHUB_TOKEN environment variable not set", file=sys.stderr)
return False
if not github_org:
print("Error: GITHUB_EVAL_ORG environment variable not set", file=sys.stderr)
return False
headers = {
"Authorization": f"Bearer {github_token}",
"Accept": "application/vnd.github.v3+json",
}
# Run verification checks
print("Verifying legacy name finding task completion...")
# 1. Check that ANSWER.md exists in master branch
print("1. Checking ANSWER.md exists in master branch...")
answer_content = _get_file_content("ANSWER.md", headers, github_org, "missing-semester", "master")
if not answer_content:
print("Error: ANSWER.md not found in master branch", file=sys.stderr)
return False
print("✓ ANSWER.md found in master branch")
# 2. Check that the content matches expected answer
print("2. Verifying ANSWER.md content...")
answer_content = answer_content.strip()
if answer_content not in EXPECTED_CONTENTS:
print(f"Error: ANSWER.md content does not match expected answer(s)", file=sys.stderr)
print(f"Expected one of: {sorted(EXPECTED_CONTENTS)}", file=sys.stderr)
print(f"Found: {answer_content}", file=sys.stderr)
return False
print("✓ ANSWER.md contains correct legacy name and URL")
print("\n✅ All verification checks passed!")
print("Legacy name finding task completed successfully:")
print(f" - ANSWER.md created in master branch")
print(f" - Content accepted: {answer_content}")
return True
if __name__ == "__main__":
success = verify()
sys.exit(0 if success else 1)
================================================
FILE: tasks/github/standard/missing-semester/find_salient_file/description.md
================================================
I want to know which file has been modified most frequently in the past 100 commits. However, I don't want to consider files related to GitHub Actions.
Please find the file, create an ANSWER.md, and write the file name in it.
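For reference, a minimal sketch of one way to count the modifications with the GitHub REST API (the owner and token are placeholders; `.github/` is used as the GitHub Actions exclusion):

```python
from collections import Counter

import requests

OWNER, REPO = "<org>", "missing-semester"          # placeholders
API = f"https://api.github.com/repos/{OWNER}/{REPO}"
HEADERS = {"Authorization": "Bearer <token>", "Accept": "application/vnd.github.v3+json"}

counts = Counter()
for c in requests.get(f"{API}/commits", headers=HEADERS, params={"per_page": 100}).json():
    detail = requests.get(f"{API}/commits/{c['sha']}", headers=HEADERS).json()
    for changed in detail.get("files", []):
        path = changed["filename"]
        if not path.startswith(".github/"):        # skip GitHub Actions files
            counts[path] += 1

print(counts.most_common(1))                        # candidate for ANSWER.md
```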
================================================
FILE: tasks/github/standard/missing-semester/find_salient_file/meta.json
================================================
{
"task_id": "find_salient_file",
"task_name": "Find Salient File",
"category_id": "missing-semester",
"category_name": "Missing Semester",
"description": "Identify the most frequently modified file in the past 100 commits, excluding GitHub Actions related files, and create an ANSWER.md with the file name.",
"author": "Zijian Wu",
"created_at": "2025-08-15",
"difficulty": "L3",
"tags": [
"commit analysis",
"file tracking",
"git history"
],
"mcp": [
"github"
],
"meta_data": {
"stateType": "url",
"stateContent": null,
"stateUrl": "https://github.com/mcpmark-source/missing-semester",
"stateOriginalUrl": "https://github.com/missing-semester/missing-semester"
}
}
================================================
FILE: tasks/github/standard/missing-semester/find_salient_file/verify.py
================================================
import sys
import os
import requests
import base64
from typing import Dict, Optional, Tuple
from dotenv import load_dotenv
def _get_github_api(
endpoint: str, headers: Dict[str, str], org: str, repo: str = "missing-semester"
) -> Tuple[bool, Optional[Dict]]:
"""Make a GET request to GitHub API and return (success, response)."""
url = f"https://api.github.com/repos/{org}/{repo}/{endpoint}"
try:
response = requests.get(url, headers=headers)
if response.status_code == 200:
return True, response.json()
elif response.status_code == 404:
return False, None
else:
print(f"API error for {endpoint}: {response.status_code}", file=sys.stderr)
return False, None
except Exception as e:
print(f"Exception for {endpoint}: {e}", file=sys.stderr)
return False, None
def _get_file_content(
file_path: str,
headers: Dict[str, str],
org: str,
repo: str = "missing-semester",
ref: str = "master",
) -> Optional[str]:
"""Get the content of a file from the repository."""
success, result = _get_github_api(
f"contents/{file_path}?ref={ref}", headers, org, repo
)
if not success or not result:
return None
try:
content = base64.b64decode(result.get("content", "")).decode("utf-8")
return content
except Exception as e:
print(f"Content decode error for {file_path}: {e}", file=sys.stderr)
return None
def verify() -> bool:
"""
Programmatically verify that the most frequently modified file was identified correctly.
Checks for ANSWER.md file in master branch with the correct content.
"""
# Expected answer content (excluding GitHub Actions files)
EXPECTED_CONTENT = "index.md"
# Load environment variables from .mcp_env
load_dotenv(".mcp_env")
# Get GitHub token and org
github_token = os.environ.get("MCP_GITHUB_TOKEN")
github_org = os.environ.get("GITHUB_EVAL_ORG")
if not github_token:
print("Error: MCP_GITHUB_TOKEN environment variable not set", file=sys.stderr)
return False
if not github_org:
print("Error: GITHUB_EVAL_ORG environment variable not set", file=sys.stderr)
return False
headers = {
"Authorization": f"Bearer {github_token}",
"Accept": "application/vnd.github.v3+json",
}
# Run verification checks
print("Verifying salient file identification task completion...")
# 1. Check that ANSWER.md exists in master branch
print("1. Checking ANSWER.md exists in master branch...")
answer_content = _get_file_content("ANSWER.md", headers, github_org, "missing-semester", "master")
if not answer_content:
print("Error: ANSWER.md not found in master branch", file=sys.stderr)
return False
print("✅ ANSWER.md found in master branch")
# 2. Check that the content matches expected answer
print("2. Verifying ANSWER.md content...")
answer_content = answer_content.strip()
if answer_content != EXPECTED_CONTENT:
print(f"Error: ANSWER.md content does not match expected answer", file=sys.stderr)
print(f"Expected: {EXPECTED_CONTENT}", file=sys.stderr)
print(f"Found: {answer_content}", file=sys.stderr)
return False
print("✅ ANSWER.md contains correct filename")
print("\n✅ All verification checks passed!")
print("Salient file identification task completed successfully:")
print(f" - ANSWER.md created in master branch")
print(f" - Content: {EXPECTED_CONTENT}")
return True
if __name__ == "__main__":
success = verify()
sys.exit(0 if success else 1)
================================================
FILE: tasks/notion/easy/.gitkeep
================================================
================================================
FILE: tasks/notion/easy/computer_science_student_dashboard/simple__code_snippets_go/description.md
================================================
Find the page named "Computer Science Student Dashboard" and extend the **Code Snippets** section with Go content.
**Task Requirements:**
1. Add a bold paragraph that contains exactly the text `Go` to mark the start of the Go snippets.
2. Directly under that paragraph, add three code blocks configured with `language` set to **go**:
a. **Basic Go program** – Caption must be `Basic Go program` and the code content must be exactly:
```go
package main
import "fmt"
func main() {
fmt.Println("Hello, World!")
}
```
b. **For loop in Go** – Caption must be `For loop in Go` and the code content must be exactly:
```go
for i := 0; i < 5; i++ {
fmt.Println(i)
}
```
c. **Function definition in Go** – Caption must be `Function definition in Go` and the code content must be exactly:
```go
func add(a, b int) int {
return a + b
}
```
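For reference, a minimal sketch of how the marker paragraph and the first snippet could be appended with `notion-client` (the target block ID is a placeholder for the parent of the Code Snippets section; the remaining two code blocks follow the same payload shape):

```python
from notion_client import Client

notion = Client(auth="<token>")
CODE_SNIPPETS_BLOCK_ID = "<code-snippets-parent-block-id>"    # placeholder

notion.blocks.children.append(
    block_id=CODE_SNIPPETS_BLOCK_ID,
    children=[
        {   # bold paragraph marking the start of the Go snippets
            "type": "paragraph",
            "paragraph": {"rich_text": [{
                "type": "text",
                "text": {"content": "Go"},
                "annotations": {"bold": True},
            }]},
        },
        {   # first captioned Go code block
            "type": "code",
            "code": {
                "language": "go",
                "caption": [{"type": "text", "text": {"content": "Basic Go program"}}],
                "rich_text": [{"type": "text", "text": {
                    "content": 'package main\n\nimport "fmt"\n\nfunc main() {\n    fmt.Println("Hello, World!")\n}',
                }}],
            },
        },
    ],
)
```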
================================================
FILE: tasks/notion/easy/computer_science_student_dashboard/simple__code_snippets_go/meta.json
================================================
{
"task_id": "simple__code_snippets_go",
"task_name": "Simple Code Snippets Go",
"category_id": "computer_science_student_dashboard",
"category_name": "Computer Science Student Dashboard",
"description": "Add a new Go column to the Code Snippets section between Python and JavaScript columns.",
"author": "Xiangyan Liu",
"created_at": "2025-11-15",
"difficulty": "L1",
"tags": [
"content organization",
"visual formatting",
"template population"
],
"mcp": [
"notion"
],
"meta_data": {
"stateType": "url",
"stateContent": null,
"stateUrl": "https://painted-tennis-ebc.notion.site/Computer-Science-Student-Dashboard-23e81626b6d78083b787d3c832b02ef4",
"stateOriginalUrl": "https://www.notion.so/marketplace/templates/computer-science-student-dashboard"
}
}
================================================
FILE: tasks/notion/easy/computer_science_student_dashboard/simple__code_snippets_go/verify.py
================================================
import sys
from notion_client import Client
from tasks.utils import notion_utils
# Expected code blocks (language=go)
EXPECTED_CODE_BLOCKS = [
{
"caption": "Basic Go program",
"code": (
'package main\n\nimport "fmt"\n\nfunc main() {\n fmt.Println("Hello, World!")\n}'
),
},
{
"caption": "For loop in Go",
"code": ("for i := 0; i < 5; i++ {\n fmt.Println(i)\n}"),
},
{
"caption": "Function definition in Go",
"code": ("func add(a, b int) int {\n return a + b\n}"),
},
]
HEADER_TEXT = "Go"
def _normalize(text: str) -> str:
"""Remove trailing spaces on each line and strip leading/trailing blank lines."""
return "\n".join(line.rstrip() for line in text.strip().splitlines())
def _find_page(notion: Client, main_id: str | None) -> str | None:
"""Return a page_id to verify against or None if not found."""
page_id = None
if main_id:
found_id, object_type = notion_utils.find_page_or_database_by_id(
notion, main_id
)
if found_id and object_type == "page":
page_id = found_id
if not page_id:
page_id = notion_utils.find_page(notion, "Computer Science Student Dashboard")
return page_id
def _has_bold_header_text(block, text: str) -> bool:
"""Generic bold header/paragraph check for a given text."""
block_type = block.get("type")
if block_type not in {"paragraph", "heading_1", "heading_2", "heading_3"}:
return False
rich_text_list = block.get(block_type, {}).get("rich_text", [])
if not rich_text_list:
return False
plain = "".join(rt.get("plain_text", "") for rt in rich_text_list).strip()
if plain != text:
return False
return any(rt.get("annotations", {}).get("bold", False) for rt in rich_text_list)
def _collect_code_blocks(blocks):
"""Return list of (code_content, caption) tuples for code blocks with language 'go'."""
collected = []
for block in blocks:
if block.get("type") != "code":
continue
code_data = block.get("code", {})
if code_data.get("language") != "go":
continue
code_plain = "".join(
rt.get("plain_text", "") for rt in code_data.get("rich_text", [])
)
caption_plain = "".join(
rt.get("plain_text", "") for rt in code_data.get("caption", [])
)
collected.append((code_plain, caption_plain))
return collected
def verify(notion: Client, main_id: str | None = None) -> bool:
page_id = _find_page(notion, main_id)
if not page_id:
print("Error: Target page not found.", file=sys.stderr)
return False
all_blocks = notion_utils.get_all_blocks_recursively(notion, page_id)
# Verify header
header_ok = any(_has_bold_header_text(b, HEADER_TEXT) for b in all_blocks)
if not header_ok:
print("Failure: Bold header 'Go' not found.", file=sys.stderr)
return False
# Verify code blocks
code_blocks_found = _collect_code_blocks(all_blocks)
remaining = EXPECTED_CODE_BLOCKS.copy()
for code, caption in code_blocks_found:
norm_code = _normalize(code)
for expected in remaining:
if (
_normalize(expected["code"]) == norm_code
and expected["caption"] == caption
):
remaining.remove(expected)
break
if remaining:
missing = ", ".join(exp["caption"] for exp in remaining)
print(
f"Failure: Missing or incorrect Go code blocks: {missing}", file=sys.stderr
)
return False
print(
"Success: Verified Go header and required Go code blocks."
)
return True
def main():
notion = notion_utils.get_notion_client()
main_id = sys.argv[1] if len(sys.argv) > 1 else None
sys.exit(0 if verify(notion, main_id) else 1)
if __name__ == "__main__":
main()
================================================
FILE: tasks/notion/easy/computer_science_student_dashboard/simple__study_session_tracker/description.md
================================================
Create a new study-session entry on the **Computer Science Student Dashboard** page.
1. Locate the ☑️ Habit tracker section of the page.
2. **Insert a new date mention** for `2025-01-29` immediately **after the existing `2022-09-02` items but before the divider block** that follows them. Match the formatting of the existing dates (bold text with a Notion date mention).
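For reference, a minimal sketch using `notion-client` and the `after` parameter of the append-children endpoint (both block IDs are placeholders):

```python
from notion_client import Client

notion = Client(auth="<token>")
HABIT_PARENT_BLOCK_ID = "<habit-tracker-parent-block-id>"     # placeholder
LAST_2022_09_02_BLOCK_ID = "<last-2022-09-02-block-id>"       # placeholder

notion.blocks.children.append(
    block_id=HABIT_PARENT_BLOCK_ID,
    after=LAST_2022_09_02_BLOCK_ID,   # lands before the divider that follows
    children=[{
        "type": "paragraph",
        "paragraph": {"rich_text": [{
            "type": "mention",
            "mention": {"date": {"start": "2025-01-29"}},
            "annotations": {"bold": True},
        }]},
    }],
)
```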
================================================
FILE: tasks/notion/easy/computer_science_student_dashboard/simple__study_session_tracker/meta.json
================================================
{
"task_id": "simple__study_session_tracker",
"task_name": "Simple Study Session Tracker",
"category_id": "computer_science_student_dashboard",
"category_name": "Computer Science Student Dashboard",
"description": "Create a new study-session entry in the Habit tracker section with four unchecked to-do items.",
"author": "Xiangyan Liu",
"created_at": "2025-11-15",
"difficulty": "L1",
"tags": [
"content organization",
"visual formatting",
"status tracking"
],
"mcp": [
"notion"
],
"meta_data": {
"stateType": "url",
"stateContent": null,
"stateUrl": "https://painted-tennis-ebc.notion.site/Computer-Science-Student-Dashboard-23e81626b6d78083b787d3c832b02ef4",
"stateOriginalUrl": "https://www.notion.so/marketplace/templates/computer-science-student-dashboard"
}
}
================================================
FILE: tasks/notion/easy/computer_science_student_dashboard/simple__study_session_tracker/verify.py
================================================
import sys
from notion_client import Client
from tasks.utils import notion_utils
def verify(notion: Client, main_id: str | None = None) -> bool:
"""Verify that the new study-session entry for 2025-01-29 was added correctly.
The script checks that:
1. A bold date-mention with start=2025-01-29 exists.
2. The mention sits after the 2022-09-02 section but before the divider that originally
followed that section.
"""
# ---------------------------------------------------------------------
# Locate the main page -------------------------------------------------
# ---------------------------------------------------------------------
page_id: str | None = None
if main_id:
found_id, object_type = notion_utils.find_page_or_database_by_id(
notion, main_id
)
if found_id and object_type == "page":
page_id = found_id
if not page_id:
page_id = notion_utils.find_page(notion, "Computer Science Student Dashboard")
if not page_id:
print(
"Error: Page 'Computer Science Student Dashboard' not found.",
file=sys.stderr,
)
return False
# ---------------------------------------------------------------------
# Fetch all blocks under the page (flattened order) --------------------
# ---------------------------------------------------------------------
all_blocks = notion_utils.get_all_blocks_recursively(notion, page_id)
# ---------------------------------------------------------------------
# Locate reference blocks ---------------------------------------------
# ---------------------------------------------------------------------
TARGET_DATE = "2025-01-29"
PREVIOUS_DATE = "2022-09-02"
index_previous_date: int | None = None
index_new_date: int | None = None
index_divider_after_previous: int | None = None
for idx, block in enumerate(all_blocks):
# Divider detection (we care only about the first divider that appears after
# the 2022-09-02 block)
if block.get("type") == "divider":
if index_previous_date is not None and index_divider_after_previous is None:
index_divider_after_previous = idx
# We only need to inspect paragraph blocks that contain a date mention
if block.get("type") != "paragraph":
continue
rich_text_list = block["paragraph"].get("rich_text", [])
for rt in rich_text_list:
if (
rt.get("type") != "mention"
or rt.get("mention", {}).get("type") != "date"
):
continue
date_start = rt["mention"]["date"].get("start")
if date_start == PREVIOUS_DATE and index_previous_date is None:
index_previous_date = idx
if date_start == TARGET_DATE and index_new_date is None:
index_new_date = idx
# (1) Verify bold annotation
if not rt.get("annotations", {}).get("bold", False):
print(
"Error: The 2025-01-29 date mention is not bold.",
file=sys.stderr,
)
return False
# Ensure all reference indices were found
if index_previous_date is None:
print("Error: Could not locate the 2022-09-02 date section.", file=sys.stderr)
return False
if index_divider_after_previous is None:
print(
"Error: Could not locate the divider that follows the 2022-09-02 section.",
file=sys.stderr,
)
return False
if index_new_date is None:
print(
"Error: Could not locate the new 2025-01-29 date mention.", file=sys.stderr
)
return False
# (2) Verify ordering
if not (index_previous_date < index_new_date < index_divider_after_previous):
print(
"Error: The 2025-01-29 section is positioned incorrectly.", file=sys.stderr
)
return False
# ---------------------------------------------------------------------
# Success --------------------------------------------------------------
# ---------------------------------------------------------------------
print("Success: Date mention for 2025-01-29 added in the correct position.")
return True
# -------------------------------------------------------------------------
# Command-line entry-point -------------------------------------------------
# -------------------------------------------------------------------------
def main() -> None:
notion = notion_utils.get_notion_client()
main_id = sys.argv[1] if len(sys.argv) > 1 else None
if verify(notion, main_id):
sys.exit(0)
else:
sys.exit(1)
if __name__ == "__main__":
main()
================================================
FILE: tasks/notion/easy/it_trouble_shooting_hub/simple__asset_retirement_migration/description.md
================================================
Please migrate expiring assets out of the **IT Inventory** database using the simplified checklist below. Your changes will be verified automatically, so match the details exactly.
---
Task Steps
1. Inside the **IT Trouble Shooting Hub** page, locate the database named **IT Inventory**.
2. Collect every page in **IT Inventory** whose **Status** is **Expired** or **To be returned**.
3. Create a **new full-page database** under the same hub titled **IT Asset Retirement Queue** with exactly these properties (names and types must match):
• Serial – title
• Status – select
• Expiration date – date
4. For every item gathered in step 2, create a page in **IT Asset Retirement Queue**, copy over the Serial, Status, and Expiration date values, then archive the original inventory page once the copy is made.
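For reference, a minimal sketch of the migration with `notion-client` (IDs are placeholders; it assumes the inventory's title property is also named Serial and that each matching row has an expiration date set):

```python
from notion_client import Client

notion = Client(auth="<token>")
HUB_PAGE_ID = "<it-trouble-shooting-hub-page-id>"             # placeholder
INVENTORY_DB_ID = "<it-inventory-database-id>"                # placeholder

# 1. Create the retirement queue with the required schema.
queue = notion.databases.create(
    parent={"type": "page_id", "page_id": HUB_PAGE_ID},
    title=[{"type": "text", "text": {"content": "IT Asset Retirement Queue"}}],
    properties={
        "Serial": {"title": {}},
        "Status": {"select": {}},
        "Expiration date": {"date": {}},
    },
)

# 2. Collect the expiring inventory pages.
expiring = notion.databases.query(
    database_id=INVENTORY_DB_ID,
    filter={"or": [
        {"property": "Status", "select": {"equals": "Expired"}},
        {"property": "Status", "select": {"equals": "To be returned"}},
    ]},
)["results"]

# 3. Copy each page into the queue, then archive the original.
for page in expiring:
    props = page["properties"]
    serial = "".join(t.get("plain_text", "") for t in props["Serial"]["title"])
    notion.pages.create(
        parent={"database_id": queue["id"]},
        properties={
            "Serial": {"title": [{"text": {"content": serial}}]},
            "Status": {"select": {"name": props["Status"]["select"]["name"]}},
            "Expiration date": {"date": props["Expiration date"]["date"]},
        },
    )
    notion.pages.update(page_id=page["id"], archived=True)
```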
================================================
FILE: tasks/notion/easy/it_trouble_shooting_hub/simple__asset_retirement_migration/meta.json
================================================
{
"task_id": "simple__asset_retirement_migration",
"task_name": "Simple Asset Retirement Migration",
"category_id": "it_trouble_shooting_hub",
"category_name": "IT Trouble Shooting Hub",
"description": "Restructure the IT Inventory database by migrating expired assets to a new IT Asset Retirement Queue database.",
"author": "Xiangyan Liu",
"created_at": "2025-11-15",
"difficulty": "L1",
"tags": [
"database manipulation",
"automated migration",
"conditional filtering",
"data aggregation",
"report generation"
],
"mcp": [
"notion"
],
"meta_data": {
"stateType": "url",
"stateContent": null,
"stateUrl": "https://painted-tennis-ebc.notion.site/It-Trouble-Shooting-Hub-23e81626b6d78020aba7eb65ae1cc2d5",
"stateOriginalUrl": "https://www.notion.so/marketplace/templates/it-trouble-shooting-hub"
}
}
================================================
FILE: tasks/notion/easy/it_trouble_shooting_hub/simple__asset_retirement_migration/verify.py
================================================
import sys
from typing import Dict
from notion_client import Client
from tasks.utils import notion_utils
def _get_database(root_page_id: str, notion: Client, name: str) -> str | None:
"""Helper that finds a child database by title inside a page."""
return notion_utils.find_database_in_block(notion, root_page_id, name)
def _check_property(props: Dict, name: str, expected_type: str) -> bool:
if name not in props:
print(f"Error: Property '{name}' missing in database.", file=sys.stderr)
return False
if props[name]["type"] != expected_type:
print(
f"Error: Property '{name}' expected type '{expected_type}', found '{props[name]['type']}'.",
file=sys.stderr,
)
return False
return True
def verify(notion: Client, main_id: str | None = None) -> bool:
"""Verifies that the IT Asset Retirement Queue was created and populated correctly."""
# -------------------------------------------------------------------------
# Resolve the root IT Trouble Shooting Hub page
# -------------------------------------------------------------------------
root_page_id = None
if main_id:
found_id, obj_type = notion_utils.find_page_or_database_by_id(notion, main_id)
if found_id and obj_type == "page":
root_page_id = found_id
if not root_page_id:
root_page_id = notion_utils.find_page(notion, "IT Trouble Shooting Hub")
if not root_page_id:
print(
"Error: Could not locate the 'IT Trouble Shooting Hub' page.",
file=sys.stderr,
)
return False
# -------------------------------------------------------------------------
# Locate the original and new databases
# -------------------------------------------------------------------------
inventory_db_id = _get_database(root_page_id, notion, "IT Inventory")
if not inventory_db_id:
print("Error: 'IT Inventory' database not found.", file=sys.stderr)
return False
retirement_db_id = _get_database(root_page_id, notion, "IT Asset Retirement Queue")
if not retirement_db_id:
print("Error: 'IT Asset Retirement Queue' database not found.", file=sys.stderr)
return False
# -------------------------------------------------------------------------
# Validate schema of the retirement queue database
# -------------------------------------------------------------------------
retirement_db = notion.databases.retrieve(database_id=retirement_db_id)
r_props = retirement_db["properties"]
required_schema = {
"Serial": "title",
"Status": "select",
"Expiration date": "date",
}
for pname, ptype in required_schema.items():
if not _check_property(r_props, pname, ptype):
return False
# -------------------------------------------------------------------------
# Validate that inventory items are moved & archived
# -------------------------------------------------------------------------
expired_filter = {
"property": "Status",
"select": {"equals": "Expired"},
}
to_return_filter = {
"property": "Status",
"select": {"equals": "To be returned"},
}
compound_filter = {"or": [expired_filter, to_return_filter]}
# Query for any *active* items that still match these statuses
remaining_items = notion.databases.query(
database_id=inventory_db_id,
filter=compound_filter,
archived=False,
).get("results", [])
if remaining_items:
print(
f"Error: {len(remaining_items)} 'Expired' / 'To be returned' items still present in IT Inventory.",
file=sys.stderr,
)
return False
# There should be at least one entry in the retirement queue
retirement_pages = notion.databases.query(database_id=retirement_db_id).get(
"results", []
)
expected_serials = {"65XYQ/GB", "36x10PIQ"}
if len(retirement_pages) != len(expected_serials):
print(
f"Error: Expected {len(expected_serials)} retirement pages, found {len(retirement_pages)}.",
file=sys.stderr,
)
return False
serials_seen = set()
for page in retirement_pages:
props = page["properties"]
# Collect Serial title
title_rich = props.get("Serial", {}).get("title", [])
serial_val = "".join([t.get("plain_text", "") for t in title_rich]).strip()
serials_seen.add(serial_val)
if serials_seen != expected_serials:
print(
f"Error: Serial values mismatch. Expected {sorted(expected_serials)}, found {sorted(serials_seen)}.",
file=sys.stderr,
)
return False
print("Success: All verification criteria satisfied.")
return True
def main():
notion = notion_utils.get_notion_client()
main_id = sys.argv[1] if len(sys.argv) > 1 else None
if verify(notion, main_id):
sys.exit(0)
else:
sys.exit(1)
if __name__ == "__main__":
main()
================================================
FILE: tasks/notion/easy/japan_travel_planner/simple__remove_osaka_itinerary/description.md
================================================
Go to the Japan Travel Planner page, open the Travel Itinerary database, and remove the OSAKA itinerary items after 6 PM (excluding 6 PM) on Day 1 and Day 2.
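For reference, a minimal sketch with `notion-client` (the database ID is a placeholder; the small time parser only understands strings such as "6 PM" or "7:30 PM"):

```python
from notion_client import Client

notion = Client(auth="<token>")
ITINERARY_DB_ID = "<travel-itinerary-database-id>"            # placeholder

def after_6pm(text: str) -> bool:
    """Return True for times strictly after 6 PM, e.g. "7 PM" or "7:30 PM"."""
    text = text.strip().upper()
    if not text.endswith("PM"):
        return False
    hours, _, minutes = text[:-2].strip().partition(":")
    try:
        total = (int(hours) % 12 + 12) * 60 + (int(minutes) if minutes else 0)
    except ValueError:
        return False
    return total > 18 * 60

pages = notion.databases.query(
    database_id=ITINERARY_DB_ID,
    filter={"and": [
        {"property": "Group", "select": {"equals": "Osaka"}},
        {"or": [
            {"property": "Day", "select": {"equals": "Day 1"}},
            {"property": "Day", "select": {"equals": "Day 2"}},
        ]},
    ]},
)["results"]

for page in pages:
    notes = page["properties"]["Notes"]["rich_text"]
    time_text = notes[0]["plain_text"] if notes else ""
    if after_6pm(time_text):
        notion.pages.update(page_id=page["id"], archived=True)
```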
================================================
FILE: tasks/notion/easy/japan_travel_planner/simple__remove_osaka_itinerary/meta.json
================================================
{
"task_id": "simple__remove_osaka_itinerary",
"task_name": "Simple Remove Osaka Itinerary",
"category_id": "japan_travel_planner",
"category_name": "Japan Travel Planner",
"description": "Remove the itinerary items in Osaka after 6 PM from Day 1 and Day 2 travel schedules.",
"author": "Xiangyan Liu",
"created_at": "2025-11-15",
"difficulty": "L1",
"tags": [
"conditional filtering",
"automated migration"
],
"mcp": [
"notion"
],
"meta_data": {
"stateType": "url",
"stateContent": null,
"stateUrl": "https://painted-tennis-ebc.notion.site/Japan-Travel-Planner-23181626b6d781c4b6bedb12786b5abe",
"stateOriginalUrl": "https://www.notion.so/marketplace/templates/japantravelplanner101"
}
}
================================================
FILE: tasks/notion/easy/japan_travel_planner/simple__remove_osaka_itinerary/verify.py
================================================
import sys
from notion_client import Client
from tasks.utils import notion_utils
def get_page_title(page_result):
"""Extract title from a page result"""
properties = page_result.get('properties', {})
name_property = properties.get('Name', {})
if name_property.get('type') == 'title':
title_array = name_property.get('title', [])
if title_array and len(title_array) > 0:
return title_array[0].get('plain_text', '')
return ''
def get_page_time(page_result):
"""Extract time from Notes field"""
properties = page_result.get('properties', {})
notes_property = properties.get('Notes', {})
if notes_property.get('type') == 'rich_text':
rich_text_array = notes_property.get('rich_text', [])
if rich_text_array and len(rich_text_array) > 0:
notes_text = rich_text_array[0].get('plain_text', '')
return notes_text.strip()
return ''
def get_page_group(page_result):
"""Extract group/location from page"""
properties = page_result.get('properties', {})
group_property = properties.get('Group', {})
if group_property.get('type') == 'select':
select = group_property.get('select')
if select:
return select.get('name', '')
return ''
def get_page_day(page_result):
"""Extract day from page"""
properties = page_result.get('properties', {})
day_property = properties.get('Day', {})
if day_property.get('type') == 'select':
select = day_property.get('select')
if select:
return select.get('name', '')
return ''
def parse_time_to_minutes(time_str):
"""Convert time string to minutes for comparison
Returns None if time cannot be parsed"""
if not time_str:
return None
# Clean the time string
time_str = time_str.strip().upper()
# Remove any text after the time (e.g., "7:30 PM\n" -> "7:30 PM")
time_str = time_str.split('\n')[0].strip()
# Extract time components
try:
if 'PM' in time_str:
time_part = time_str.replace('PM', '').strip()
if ':' in time_part:
hours, minutes = time_part.split(':')
hours = int(hours)
minutes = int(minutes)
else:
hours = int(time_part)
minutes = 0
# Convert PM hours (add 12 for PM times except 12 PM)
if hours != 12:
hours += 12
return hours * 60 + minutes
elif 'AM' in time_str:
time_part = time_str.replace('AM', '').strip()
if ':' in time_part:
hours, minutes = time_part.split(':')
hours = int(hours)
minutes = int(minutes)
else:
hours = int(time_part)
minutes = 0
# Handle 12 AM (midnight)
if hours == 12:
hours = 0
return hours * 60 + minutes
except:
return None
return None
def verify(notion: Client, main_id: str = None) -> bool:
"""
Verifies that all OSAKA events after 6PM have been removed from Day 1 and Day 2 in the Japan Travel Planner.
Expected items that should be deleted (all in OSAKA, after 6PM, on Day 1 or Day 2):
1. Rikuro's Namba Main Branch - 7 PM (Day 1)
2. Shin Sekai "New World" - 8 PM (Day 2)
3. Katsudon Chiyomatsu - 7:30 PM (Day 2)
4. Ebisubashi Bridge - 9 PM (Day 1)
Note: Kuromon Ichiba Market at 6 PM should NOT be deleted (it's at 6PM, not after)
Items after 6PM on other days (Day 3-8) should NOT be deleted
"""
# Step 1: Find the main Japan Travel Planner page
if main_id:
found_id, object_type = notion_utils.find_page_or_database_by_id(notion, main_id)
if not found_id or object_type != 'page':
print("Error: Japan Travel Planner page not found.", file=sys.stderr)
return False
else:
# Try to find the page by searching
found_id = notion_utils.find_page(notion, "Japan Travel Planner")
if not found_id:
print("Error: Japan Travel Planner page not found.", file=sys.stderr)
return False
print(f"Found Japan Travel Planner page: {found_id}")
# Step 2: Find the Travel Itinerary database
all_blocks = notion_utils.get_all_blocks_recursively(notion, found_id)
travel_itinerary_db_id = None
for block in all_blocks:
if block and block.get("type") == "child_database":
title = block.get("child_database", {}).get("title", "")
if "Travel Itinerary" in title:
travel_itinerary_db_id = block.get("id")
print(f"Found Travel Itinerary database: {travel_itinerary_db_id}")
break
if not travel_itinerary_db_id:
print("Error: Travel Itinerary database not found", file=sys.stderr)
return False
# Step 3: Query the database for OSAKA items on Day 1 and Day 2
try:
query_result = notion.databases.query(
database_id=travel_itinerary_db_id,
filter={
"and": [
{"property": "Group", "select": {"equals": "Osaka"}},
{"or": [
{"property": "Day", "select": {"equals": "Day 1"}},
{"property": "Day", "select": {"equals": "Day 2"}}
]}
]
}
)
except Exception as e:
print(f"Error querying Travel Itinerary database: {e}", file=sys.stderr)
return False
# Step 4: Check for items that should have been deleted
six_pm_minutes = 18 * 60 # 6 PM in minutes (18:00)
# Expected deleted items (4 specific items after 6 PM on Day 1 and Day 2)
expected_deleted = {
"Rikuro's Namba Main Branch": {"time": "7 PM", "day": "Day 1", "found": False},
"Shin Sekai \"New World\"": {"time": "8 PM", "day": "Day 2", "found": False},
"Katsudon Chiyomatsu": {"time": "7:30 PM", "day": "Day 2", "found": False},
"Ebisubashi Bridge": {"time": "9 PM", "day": "Day 1", "found": False}
}
# Items that should remain (at or before 6 PM)
expected_remaining = {
"Kuromon Ichiba Market": {"time": "6 PM", "found": False}
}
osaka_items_after_6pm = []
osaka_items_at_or_before_6pm = []
# Debug: Show total query results
print(f"Debug: Found {len(query_result.get('results', []))} total OSAKA items on Day 1 and Day 2")
# Process all OSAKA items on Day 1 and Day 2
for page in query_result.get('results', []):
page_title = get_page_title(page).strip()
page_time = get_page_time(page)
page_group = get_page_group(page)
page_day = get_page_day(page)
if page_group != "Osaka":
continue
# Parse time to check if after 6 PM
time_minutes = parse_time_to_minutes(page_time)
if time_minutes is not None and time_minutes > six_pm_minutes:
osaka_items_after_6pm.append({
"title": page_title,
"time": page_time,
"day": page_day,
"id": page.get('id')
})
# Check if this is one of the expected deleted items
for expected_title, expected_info in expected_deleted.items():
# Clean up the titles for comparison
clean_page_title = page_title.strip().lower()
clean_expected_title = expected_title.strip().lower()
# Check for "Rikuro's" or "Rikuro's" (different apostrophe types)
if "rikuro" in clean_page_title and "rikuro" in clean_expected_title:
title_match = True
elif clean_page_title == clean_expected_title:
title_match = True
elif clean_expected_title in clean_page_title or clean_page_title in clean_expected_title:
title_match = True
else:
title_match = False
if title_match and page_day == expected_info["day"]:
print(f"Debug: Found '{page_title}' on {page_day} at {page_time} - matches expected '{expected_title}'")
expected_deleted[expected_title]["found"] = True
elif time_minutes is not None and time_minutes <= six_pm_minutes:
osaka_items_at_or_before_6pm.append({
"title": page_title,
"time": page_time,
"day": page_day,
"id": page.get('id')
})
# Check if this is one of the expected remaining items
for expected_title in expected_remaining:
if expected_title.lower() in page_title.lower() or page_title.lower() in expected_title.lower():
expected_remaining[expected_title]["found"] = True
# Step 5: Verify results
print(f"\nVerification Summary:")
print(f"=" * 50)
all_passed = True
# Check that the 4 expected items after 6 PM have been deleted
print("\n4 Items that should be deleted (after 6 PM on Day 1 and Day 2):")
for item_name, item_info in expected_deleted.items():
if item_info["found"]:
# If found = True, it means the item still exists (was not deleted)
print(f"✗ {item_name} ({item_info['day']}, {item_info['time']}) - Still exists, should be deleted", file=sys.stderr)
all_passed = False
else:
# If found = False, it means the item was deleted correctly
print(f"✓ {item_name} ({item_info['day']}, {item_info['time']}) - Correctly deleted")
# Check that items at or before 6 PM remain
print("\nItems that should remain (at or before 6 PM on Day 1 and Day 2):")
for item_name, item_info in expected_remaining.items():
if item_info["found"]:
print(f"✓ {item_name} ({item_info['time']}) - Correctly retained")
else:
print(f"✗ {item_name} ({item_info['time']}) - Missing, should not be deleted", file=sys.stderr)
all_passed = False
# Report any items after 6 PM that still exist
if osaka_items_after_6pm:
print(f"\n✗ Found {len(osaka_items_after_6pm)} OSAKA item(s) after 6 PM on Day 1/Day 2:", file=sys.stderr)
for item in osaka_items_after_6pm:
print(f" - {item['title']} at {item['time']} ({item['day']})", file=sys.stderr)
else:
print(f"\n✓ No OSAKA items found after 6 PM on Day 1/Day 2 (all correctly deleted)")
# Report count summary
print(f"\nCount Summary:")
print(f"- OSAKA items after 6 PM on Day 1/Day 2 found: {len(osaka_items_after_6pm)} (should be 0)")
print(f"- OSAKA items at/before 6 PM on Day 1/Day 2 found: {len(osaka_items_at_or_before_6pm)}")
print(f"- Expected deletions verified: {sum(1 for item in expected_deleted.values() if not item['found'])}/4")
return all_passed
def main():
"""
Executes the verification process and exits with a status code.
"""
notion = notion_utils.get_notion_client()
main_id = sys.argv[1] if len(sys.argv) > 1 else None
if verify(notion, main_id):
print("\nVerification passed: All 4 required OSAKA events after 6 PM on Day 1 and Day 2 have been removed")
sys.exit(0)
else:
print("\nVerification failed: Some OSAKA events after 6 PM on Day 1/Day 2 still exist")
sys.exit(1)
if __name__ == "__main__":
main()
================================================
FILE: tasks/notion/easy/online_resume/simple__skills_development_tracker/description.md
================================================
Create a comprehensive skills audit system by performing the following tasks:
**Task Requirements:**
1. Create a new database named "Skills Development Tracker" as a child database in the main resume page with the following properties:
- Name (title property)
- Current Skill (relation to Skills database)
- Current Proficiency (rollup from related skill's "Skill Level" property)
- Target Proficiency (number property with format "percent")
- Gap (formula: Target Proficiency - Current Proficiency)
- Learning Resources (rich text property)
- Progress Notes (rich text property)
2. Populate the Skills Development Tracker database with entries for all skills that have a proficiency level below 70% (0.7):
- For each qualifying skill, create an entry with:
- Name: "[Skill Name] Development Plan"
- Link to the corresponding skill in Skills database
- Target Proficiency: Set to Current + 25% (capped at 95%)
- Learning Resources: "Online courses and practice projects"
- Progress Notes: "Initial assessment completed"
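For reference, a minimal sketch of the database creation with `notion-client` (IDs are placeholders; whether the Gap formula can read the rollup directly depends on how Notion evaluates the rollup's output):

```python
from notion_client import Client

notion = Client(auth="<token>")
RESUME_PAGE_ID = "<resume-page-id>"                # placeholder
SKILLS_DB_ID = "<skills-database-id>"              # placeholder

tracker = notion.databases.create(
    parent={"type": "page_id", "page_id": RESUME_PAGE_ID},
    title=[{"type": "text", "text": {"content": "Skills Development Tracker"}}],
    properties={
        "Name": {"title": {}},
        "Current Skill": {"relation": {"database_id": SKILLS_DB_ID, "single_property": {}}},
        "Current Proficiency": {"rollup": {
            "relation_property_name": "Current Skill",
            "rollup_property_name": "Skill Level",
            "function": "show_original",
        }},
        "Target Proficiency": {"number": {"format": "percent"}},
        "Gap": {"formula": {
            "expression": 'prop("Target Proficiency") - prop("Current Proficiency")'
        }},
        "Learning Resources": {"rich_text": {}},
        "Progress Notes": {"rich_text": {}},
    },
)
```

Populating the tracker then follows the same pattern as the other steps: query the Skills database for rows whose Skill Level is below 0.7 and create one tracker page per match.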
================================================
FILE: tasks/notion/easy/online_resume/simple__skills_development_tracker/meta.json
================================================
{
"task_id": "simple__skills_development_tracker",
"task_name": "Simple Skills Development Tracker",
"category_id": "online_resume",
"category_name": "Online Resume",
"description": "Create a comprehensive skills audit system with development tracking for skills below 70% proficiency.",
"author": "Xiangyan Liu",
"created_at": "2025-11-15",
"difficulty": "L1",
"tags": [
"database manipulation",
"cross-reference linking",
"conditional filtering",
"data aggregation",
"template population",
"visual formatting"
],
"mcp": [
"notion"
],
"meta_data": {
"stateType": "url",
"stateContent": null,
"stateUrl": "https://painted-tennis-ebc.notion.site/Online-Resume-23181626b6d781159faaeb5eadaf612e",
"stateOriginalUrl": "https://www.notion.so/marketplace/templates/online-resume"
}
}
================================================
FILE: tasks/notion/easy/online_resume/simple__skills_development_tracker/verify.py
================================================
import sys
from notion_client import Client
from tasks.utils import notion_utils
def verify(notion: Client, main_id: str = None) -> bool:
"""
Verifies that the Skills Development Tracker database was created correctly.
"""
page_id = None
if main_id:
found_id, object_type = notion_utils.find_page_or_database_by_id(
notion, main_id
)
if found_id and object_type == "page":
page_id = found_id
if not page_id:
page_id = notion_utils.find_page(notion, "New Online Resume")
if not page_id:
print("Error: Page 'New Online Resume' not found.", file=sys.stderr)
return False
# Step 1: Verify Skills Development Tracker database exists
tracker_db_id = notion_utils.find_database_in_block(
notion, page_id, "Skills Development Tracker"
)
if not tracker_db_id:
print(
"Error: Database 'Skills Development Tracker' not found.", file=sys.stderr
)
return False
# Step 2: Verify database schema
try:
db_info = notion.databases.retrieve(database_id=tracker_db_id)
properties = db_info.get("properties", {})
# Check required properties
required_props = {
"Name": "title",
"Current Skill": "relation",
"Current Proficiency": "rollup",
"Target Proficiency": "number",
"Gap": "formula",
"Learning Resources": "rich_text",
"Progress Notes": "rich_text",
}
for prop_name, expected_type in required_props.items():
if prop_name not in properties:
print(
f"Error: Property '{prop_name}' not found in database.",
file=sys.stderr,
)
return False
if properties[prop_name]["type"] != expected_type:
print(
f"Error: Property '{prop_name}' has incorrect type. Expected '{expected_type}', got '{properties[prop_name]['type']}'.",
file=sys.stderr,
)
return False
# Verify Target Proficiency is percent format
if (
properties["Target Proficiency"].get("number", {}).get("format")
!= "percent"
):
print(
"Error: Target Proficiency should have 'percent' format.",
file=sys.stderr,
)
return False
except Exception as e:
print(f"Error retrieving database info: {e}", file=sys.stderr)
return False
# Step 3: Get Skills database to check entries
skills_db_id = notion_utils.find_database_in_block(notion, page_id, "Skills")
if not skills_db_id:
print("Error: Skills database not found.", file=sys.stderr)
return False
# Get all skills with proficiency < 70%
skills_below_70 = []
try:
skills_results = notion.databases.query(database_id=skills_db_id).get(
"results", []
)
for skill in skills_results:
skill_level = (
skill.get("properties", {}).get("Skill Level", {}).get("number", 1.0)
)
if skill_level < 0.7:
skill_name = (
skill.get("properties", {}).get("Skill", {}).get("title", [])
)
if skill_name:
skill_name_text = skill_name[0].get("text", {}).get("content", "")
skills_below_70.append(
{
"name": skill_name_text,
"id": skill["id"],
"level": skill_level,
}
)
except Exception as e:
print(f"Error querying Skills database: {e}", file=sys.stderr)
return False
if not skills_below_70:
print("Warning: No skills found with proficiency below 70%.", file=sys.stderr)
# This might be OK if all skills are above 70%
# Step 4: Verify entries in Skills Development Tracker
try:
tracker_results = notion.databases.query(database_id=tracker_db_id).get(
"results", []
)
# Check that we have entries for skills below 70%
if len(skills_below_70) > 0 and len(tracker_results) == 0:
print(
"Error: No entries found in Skills Development Tracker database.",
file=sys.stderr,
)
return False
# Verify each entry
for entry in tracker_results:
props = entry.get("properties", {})
# Check name format
name_prop = props.get("Name", {}).get("title", [])
if not name_prop:
print("Error: Entry missing Name property.", file=sys.stderr)
return False
name_text = name_prop[0].get("text", {}).get("content", "")
if not name_text.endswith(" Development Plan"):
print(
f"Error: Entry name '{name_text}' doesn't follow expected format.",
file=sys.stderr,
)
return False
# Check relation to Skills database
skill_relation = props.get("Current Skill", {}).get("relation", [])
if not skill_relation:
print(
f"Error: Entry '{name_text}' missing Current Skill relation.",
file=sys.stderr,
)
return False
# Check Target Proficiency (should be set)
target_prof = props.get("Target Proficiency", {}).get("number")
if target_prof is None:
print(
f"Error: Entry '{name_text}' missing Target Proficiency.",
file=sys.stderr,
)
return False
# Check Learning Resources
learning_resources = props.get("Learning Resources", {}).get(
"rich_text", []
)
if not learning_resources:
print(
f"Error: Entry '{name_text}' missing Learning Resources.",
file=sys.stderr,
)
return False
# Check Progress Notes
progress_notes = props.get("Progress Notes", {}).get("rich_text", [])
if not progress_notes:
print(
f"Error: Entry '{name_text}' missing Progress Notes.",
file=sys.stderr,
)
return False
except Exception as e:
print(f"Error querying Skills Development Tracker: {e}", file=sys.stderr)
return False
print("Success: Skills Development Tracker database verified successfully.")
return True
def main():
"""
Executes the verification process and exits with a status code.
"""
notion = notion_utils.get_notion_client()
main_id = sys.argv[1] if len(sys.argv) > 1 else None
if verify(notion, main_id):
sys.exit(0)
else:
sys.exit(1)
if __name__ == "__main__":
main()
================================================
FILE: tasks/notion/easy/python_roadmap/simple__expert_level_lessons/description.md
================================================
# Task: Expert Level Learning Path (Simplified)
## Objective
Extend the Python Roadmap with a new Expert Level chapter, create a bridge lesson, and add two expert lessons that build on existing material.
## Requirements
### 1. Add the Expert Level chapter
- **Database**: Chapters
- **Name**: `Expert Level`
- **Icon**: 🟣 (purple circle emoji)
- Make sure it is linked into the roadmap alongside the existing chapters.
### 2. Create the bridge lesson
Create a lesson that connects advanced material to the new chapter:
- **Title**: `Advanced Foundations Review`
- **Status**: Done
- **Chapter**: Link it to `Expert Level`
- **Parent item**: Link to the lesson whose title contains "Control" (e.g., "Control Flow")
- **Sub-items**: Include links to the lessons containing "Decorators" and "Calling API"
### 3. Add two expert lessons
Add the following entries to the Steps database:
| Lesson Title | Status | Chapter | Parent item | Date |
|--------------|--------|---------|-------------|------|
| `Metaprogramming and AST Manipulation` | To Do | Expert Level | Advanced Foundations Review | 2025-09-15 |
| `Async Concurrency Patterns` | To Do | Expert Level | Calling API | 2025-09-20 |
The lessons must inherit the correct chapter link, parent relationship, and due date as shown above.
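For reference, a minimal sketch of the chapter, the bridge lesson, and one expert lesson with `notion-client` (database IDs and the referenced lesson IDs are placeholders; the property names follow the relations described above):

```python
from notion_client import Client

notion = Client(auth="<token>")
CHAPTERS_DB_ID = "<chapters-database-id>"          # placeholder
STEPS_DB_ID = "<steps-database-id>"                # placeholder

chapter = notion.pages.create(
    parent={"database_id": CHAPTERS_DB_ID},
    icon={"type": "emoji", "emoji": "🟣"},
    properties={"Name": {"title": [{"text": {"content": "Expert Level"}}]}},
)

bridge = notion.pages.create(
    parent={"database_id": STEPS_DB_ID},
    properties={
        "Lessons": {"title": [{"text": {"content": "Advanced Foundations Review"}}]},
        "Status": {"status": {"name": "Done"}},
        "Chapters": {"relation": [{"id": chapter["id"]}]},
        "Parent item": {"relation": [{"id": "<control-flow-lesson-id>"}]},
        "Sub-item": {"relation": [
            {"id": "<decorators-lesson-id>"},
            {"id": "<calling-api-lesson-id>"},
        ]},
    },
)

notion.pages.create(
    parent={"database_id": STEPS_DB_ID},
    properties={
        "Lessons": {"title": [{"text": {"content": "Metaprogramming and AST Manipulation"}}]},
        "Status": {"status": {"name": "To Do"}},
        "Chapters": {"relation": [{"id": chapter["id"]}]},
        "Parent item": {"relation": [{"id": bridge["id"]}]},
        "Date": {"date": {"start": "2025-09-15"}},
    },
)
```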
================================================
FILE: tasks/notion/easy/python_roadmap/simple__expert_level_lessons/meta.json
================================================
{
"task_id": "expert_level_lessons",
"task_name": "Expert Level Lessons",
"category_id": "python_roadmap",
"category_name": "Python Roadmap",
"description": "Create an Expert Level chapter with sophisticated prerequisite chains and four expert-level lessons.",
"author": "Xiangyan Liu",
"created_at": "2025-11-15",
"difficulty": "L1",
"tags": [
"database manipulation",
"cross-reference linking",
"conditional filtering",
"status tracking",
"template population"
],
"mcp": [
"notion"
],
"meta_data": {
"stateType": "url",
"stateContent": null,
"stateUrl": "https://painted-tennis-ebc.notion.site/Python-Roadmap-25281626b6d78012bf2bce1fa8711f4d",
"stateOriginalUrl": "https://www.notion.so/marketplace/templates/python-roadmap"
}
}
================================================
FILE: tasks/notion/easy/python_roadmap/simple__expert_level_lessons/verify.py
================================================
import sys
from notion_client import Client
from tasks.utils import notion_utils
TARGET_PAGE_TITLE = "Python Roadmap"
CHAPTER_NAME = "Expert Level"
CHAPTER_ICON = "🟣"
BRIDGE_TITLE = "Advanced Foundations Review"
REQUIRED_SUBITEM_TITLES = ["Decorators", "Calling API"]
LESSON_REQUIREMENTS = [
{
"title": "Metaprogramming and AST Manipulation",
"status": "To Do",
"date": "2025-09-15",
"parent_title": BRIDGE_TITLE,
},
{
"title": "Async Concurrency Patterns",
"status": "To Do",
"date": "2025-09-20",
"parent_title": "Calling API",
},
]
def _get_database_ids(notion: Client, page_id: str) -> tuple[str | None, str | None]:
"""Return the block IDs for the Chapters and Steps databases on the page."""
chapters_db_id = None
steps_db_id = None
blocks = notion_utils.get_all_blocks_recursively(notion, page_id)
for block in blocks:
if block.get("type") != "child_database":
continue
title = block.get("child_database", {}).get("title", "")
if "Chapters" in title and not chapters_db_id:
chapters_db_id = block["id"]
elif "Steps" in title and not steps_db_id:
steps_db_id = block["id"]
return chapters_db_id, steps_db_id
def _query_step_by_title(notion: Client, database_id: str, title: str, *, exact: bool = True):
"""Return the first step entry matching the given title pattern."""
title_filter = {"equals": title} if exact else {"contains": title}
response = notion.databases.query(
database_id=database_id,
filter={"property": "Lessons", "title": title_filter},
page_size=5,
)
results = response.get("results", [])
return results[0] if results else None
def verify(notion: Client, main_id: str | None = None) -> bool:
"""Verify the simplified Expert Level learning path setup."""
# Resolve the roadmap page.
if main_id:
page_id, object_type = notion_utils.find_page_or_database_by_id(notion, main_id)
if not page_id or object_type != "page":
print("Error: Python Roadmap page not found.", file=sys.stderr)
return False
else:
page_id = notion_utils.find_page(notion, TARGET_PAGE_TITLE)
if not page_id:
print("Error: Python Roadmap page not found.", file=sys.stderr)
return False
# Locate the Chapters and Steps databases.
chapters_db_id, steps_db_id = _get_database_ids(notion, page_id)
if not chapters_db_id:
print("Error: Chapters database not found on the page.", file=sys.stderr)
return False
if not steps_db_id:
print("Error: Steps database not found on the page.", file=sys.stderr)
return False
# Ensure the Expert Level chapter exists with the purple icon.
try:
chapter_resp = notion.databases.query(
database_id=chapters_db_id,
filter={"property": "Name", "title": {"equals": CHAPTER_NAME}},
page_size=1,
)
except Exception as exc:
print(f"Error querying Chapters database: {exc}", file=sys.stderr)
return False
results = chapter_resp.get("results", [])
if not results:
print("Error: Expert Level chapter not found.", file=sys.stderr)
return False
expert_chapter = results[0]
expert_chapter_id = expert_chapter["id"]
icon = expert_chapter.get("icon") or {}
if icon.get("type") != "emoji" or icon.get("emoji") != CHAPTER_ICON:
print("Error: Expert Level chapter must use the purple circle emoji icon.", file=sys.stderr)
return False
print("✓ Expert Level chapter exists with the correct icon.")
# Locate prerequisite lessons (Control Flow, Decorators, Calling API).
control_lesson = _query_step_by_title(notion, steps_db_id, "Control", exact=False)
if not control_lesson:
print("Error: Could not find a lesson containing 'Control' in its title.", file=sys.stderr)
return False
control_lesson_id = control_lesson["id"]
prerequisite_ids = {}
for title in REQUIRED_SUBITEM_TITLES:
lesson = _query_step_by_title(notion, steps_db_id, title, exact=False)
if not lesson:
print(f"Error: Required lesson containing '{title}' not found.", file=sys.stderr)
return False
prerequisite_ids[title] = lesson["id"]
# Verify the bridge lesson.
bridge_lesson = _query_step_by_title(notion, steps_db_id, BRIDGE_TITLE, exact=True)
if not bridge_lesson:
print("Error: Advanced Foundations Review lesson not found.", file=sys.stderr)
return False
status = (bridge_lesson["properties"].get("Status", {}).get("status") or {}).get("name")
if status != "Done":
print("Error: Advanced Foundations Review must have status 'Done'.", file=sys.stderr)
return False
# Ensure chapter relation includes Expert Level.
chapter_rel = bridge_lesson["properties"].get("Chapters", {}).get("relation", [])
if not any(rel["id"] == expert_chapter_id for rel in chapter_rel):
print("Error: Advanced Foundations Review must link to the Expert Level chapter.", file=sys.stderr)
return False
# Parent item should be the control lesson.
parent_rel = bridge_lesson["properties"].get("Parent item", {}).get("relation", [])
if not parent_rel or parent_rel[0]["id"] != control_lesson_id:
print("Error: Advanced Foundations Review should use the control lesson as its Parent item.", file=sys.stderr)
return False
# Sub-items must include the required lessons.
sub_rel = bridge_lesson["properties"].get("Sub-item", {}).get("relation", [])
sub_ids = {rel["id"] for rel in sub_rel}
missing = [title for title, rel_id in prerequisite_ids.items() if rel_id not in sub_ids]
if missing:
print(
f"Error: Advanced Foundations Review must include these lessons as sub-items: {', '.join(missing)}.",
file=sys.stderr,
)
return False
print("✓ Bridge lesson configured with the correct status, chapter, parent, and sub-items.")
# Verify the two expert lessons.
overall_success = True
for spec in LESSON_REQUIREMENTS:
lesson = _query_step_by_title(notion, steps_db_id, spec["title"], exact=True)
if not lesson:
print(f"Error: Lesson '{spec['title']}' not found.", file=sys.stderr)
overall_success = False
continue
lesson_ok = True
# Status check.
status_name = (lesson["properties"].get("Status", {}).get("status") or {}).get("name")
if status_name != spec["status"]:
print(
f"Error: Lesson '{spec['title']}' should have status '{spec['status']}', found '{status_name}'.",
file=sys.stderr,
)
lesson_ok = False
# Chapter relation check.
lesson_chapters = lesson["properties"].get("Chapters", {}).get("relation", [])
if not any(rel["id"] == expert_chapter_id for rel in lesson_chapters):
print(f"Error: Lesson '{spec['title']}' must link to the Expert Level chapter.", file=sys.stderr)
lesson_ok = False
# Parent relation check.
parent_title = spec["parent_title"]
if parent_title == BRIDGE_TITLE:
expected_parent_id = bridge_lesson["id"]
else:
expected_parent_id = prerequisite_ids.get(parent_title)
parent_relation = lesson["properties"].get("Parent item", {}).get("relation", [])
if not expected_parent_id:
print(
f"Error: Could not resolve expected parent '{parent_title}' for lesson '{spec['title']}'.",
file=sys.stderr,
)
lesson_ok = False
else:
if not parent_relation or parent_relation[0]["id"] != expected_parent_id:
print(
f"Error: Lesson '{spec['title']}' should have '{parent_title}' as its Parent item.",
file=sys.stderr,
)
lesson_ok = False
# Date check.
date_prop = lesson["properties"].get("Date", {}).get("date") or {}
if date_prop.get("start") != spec["date"]:
print(
f"Error: Lesson '{spec['title']}' should use date {spec['date']}, found {date_prop.get('start')}.",
file=sys.stderr,
)
lesson_ok = False
if lesson_ok:
print(f"✓ Lesson '{spec['title']}' has the expected properties.")
else:
overall_success = False
if not overall_success:
return False
print("Success: Expert Level chapter, bridge lesson, and expert lessons configured correctly.")
return True
def main() -> None:
notion = notion_utils.get_notion_client()
main_id = sys.argv[1] if len(sys.argv) > 1 else None
if verify(notion, main_id):
sys.exit(0)
sys.exit(1)
if __name__ == "__main__":
main()
================================================
FILE: tasks/notion/easy/self_assessment/simple__faq_column_layout/description.md
================================================
Navigate to the "Self Assessment" page and reorganize the FAQ toggle content to make it easier to scan.
**Task Requirements:**
1. Add a column list with two columns inside the FAQ toggle.
2. Move the first two existing Q&A pairs from the FAQ into the left column.
3. Move the third existing Q&A pair into the right column, keeping the original heading/paragraph formatting.
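For reference, the layout change can be scripted with the same `notion_client` library the verifier uses. The sketch below is a minimal illustration under stated assumptions: the integration token and the FAQ toggle's block ID are placeholders you would resolve by reading the page first, and in the real task you would move copies of the existing Q&A blocks into the columns and then archive the originals instead of creating the placeholder text shown here.
```python
from notion_client import Client

notion = Client(auth="<NOTION_TOKEN>")   # placeholder token
FAQ_TOGGLE_ID = "<faq-toggle-block-id>"  # hypothetical: found by scanning the page blocks

def qa_pair(question: str, answer: str) -> list[dict]:
    """Build a heading_3 + paragraph pair in the shape the verifier looks for."""
    return [
        {"type": "heading_3",
         "heading_3": {"rich_text": [{"type": "text", "text": {"content": question}}]}},
        {"type": "paragraph",
         "paragraph": {"rich_text": [{"type": "text", "text": {"content": answer}}]}},
    ]

# Append a column_list with two columns inside the FAQ toggle (the append endpoint
# accepts nested children; a column_list needs at least two columns). With the real
# content you would place copies of the existing Q&A blocks here and then delete the
# originals so nothing remains directly under the toggle.
notion.blocks.children.append(
    block_id=FAQ_TOGGLE_ID,
    children=[{
        "type": "column_list",
        "column_list": {"children": [
            {"type": "column",
             "column": {"children": qa_pair("Q1?", "A1.") + qa_pair("Q2?", "A2.")}},
            {"type": "column",
             "column": {"children": qa_pair("Q3?", "A3.")}},
        ]},
    }],
)
```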
================================================
FILE: tasks/notion/easy/self_assessment/simple__faq_column_layout/meta.json
================================================
{
"task_id": "simple__faq_column_layout",
"task_name": "Simple FAQ Column Layout",
"category_id": "self_assessment",
"category_name": "Self Assessment",
"description": "Reorganize the FAQ section content into a two-column layout with balanced Q&A pairs.",
"author": "Xiangyan Liu",
"created_at": "2025-11-15",
"difficulty": "L1",
"tags": [
"content organization",
"visual formatting",
"template population"
],
"mcp": [
"notion"
],
"meta_data": {
"stateType": "url",
"stateContent": null,
"stateUrl": "https://painted-tennis-ebc.notion.site/Self-Assessment-24381626b6d780fe9f56c2ba14ea042d",
"stateOriginalUrl": "https://painted-tennis-ebc.notion.site/Self-Assessment-24381626b6d780fe9f56c2ba14ea042d"
}
}
================================================
FILE: tasks/notion/easy/self_assessment/simple__faq_column_layout/verify.py
================================================
import sys
from notion_client import Client
from tasks.utils import notion_utils
def verify(notion: Client, main_id: str = None) -> bool:
"""
Verifies that the FAQ toggle has been properly reorganized with a column list.
"""
# Start from main_id if provided
page_id = None
if main_id:
found_id, object_type = notion_utils.find_page_or_database_by_id(
notion, main_id
)
if found_id and object_type == "page":
page_id = found_id
if not page_id:
# Try to find the Self Assessment page
page_id = notion_utils.find_page(notion, "Self Assessment")
if not page_id:
print("Error: Self Assessment page not found.", file=sys.stderr)
return False
# Get all blocks recursively from the page
all_blocks = notion_utils.get_all_blocks_recursively(notion, page_id)
# Find the FAQ toggle block
faq_toggle_block = None
faq_toggle_id = None
for block in all_blocks:
if block.get("type") == "toggle":
block_text = notion_utils.get_block_plain_text(block)
if "FAQ" in block_text:
faq_toggle_block = block
faq_toggle_id = block.get("id")
print(f"Found FAQ toggle block: {block_text}")
break
if not faq_toggle_block:
print("Error: FAQ toggle block not found.", file=sys.stderr)
return False
# Find column_list inside the FAQ toggle
column_list_block = None
for block in all_blocks:
if (
block.get("type") == "column_list"
and block.get("parent", {}).get("block_id") == faq_toggle_id
):
column_list_block = block
break
if not column_list_block:
print("Error: No column_list found inside FAQ toggle.", file=sys.stderr)
return False
# Check that there are no Q&A pairs directly under FAQ toggle (outside column_list)
direct_faq_children = []
for block in all_blocks:
if block.get("parent", {}).get("block_id") == faq_toggle_id and block.get(
"id"
) != column_list_block.get("id"):
direct_faq_children.append(block)
# Check if any of these are heading_3 or paragraph blocks (Q&A content)
for block in direct_faq_children:
if block.get("type") in ["heading_3", "paragraph"]:
print(
f"Error: Found Q&A content outside column_list: {notion_utils.get_block_plain_text(block)[:50]}...",
file=sys.stderr,
)
return False
# Find the two columns
columns = []
column_list_id = column_list_block.get("id")
for block in all_blocks:
if (
block.get("type") == "column"
and block.get("parent", {}).get("block_id") == column_list_id
):
columns.append(block)
if len(columns) != 2:
print(f"Error: Expected 2 columns, found {len(columns)}.", file=sys.stderr)
return False
# Count Q&A pairs in each column
qa_counts = []
total_pairs = 0
for i, column in enumerate(columns[:2]):
column_id = column.get("id")
column_blocks = [
block
for block in all_blocks
if block.get("parent", {}).get("block_id") == column_id
]
qa_pairs = 0
j = 0
while j < len(column_blocks):
if (
column_blocks[j].get("type") == "heading_3"
and j + 1 < len(column_blocks)
and column_blocks[j + 1].get("type") == "paragraph"
):
qa_pairs += 1
j += 2
else:
j += 1
qa_counts.append(qa_pairs)
total_pairs += qa_pairs
print(f"Column {i + 1}: Found {qa_pairs} Q&A pairs")
if qa_counts[0] < 2:
print(
f"Error: Left column should contain at least 2 Q&A pairs, found {qa_counts[0]}.",
file=sys.stderr,
)
return False
if qa_counts[1] < 1:
print(
f"Error: Right column should contain at least 1 Q&A pair, found {qa_counts[1]}.",
file=sys.stderr,
)
return False
if total_pairs < 3:
print(
f"Error: Expected at least 3 total Q&A pairs across both columns, found {total_pairs}.",
file=sys.stderr,
)
return False
print(
"Success: FAQ toggle organized with two columns holding the existing Q&A pairs (two on the left, one on the right)."
)
return True
def main():
"""
Executes the verification process and exits with a status code.
"""
notion = notion_utils.get_notion_client()
main_id = sys.argv[1] if len(sys.argv) > 1 else None
if verify(notion, main_id):
sys.exit(0)
else:
sys.exit(1)
if __name__ == "__main__":
main()
================================================
FILE: tasks/notion/easy/standard_operating_procedure/simple__section_organization/description.md
================================================
# Task: Reorganize Standard Operating Procedure Page Sections
## Objective
Modify the structure of the Standard Operating Procedure page in Notion by updating the order of two sections.
## Requirements
- Navigate to the Standard Operating Procedure page
- Swap the positions of the "Terminologies" and "Roles & responsibilities" sections
- Preserve all content within each section exactly as is
- Maintain the original formatting and structure of each section
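One possible scripted approach is sketched below with the `notion_client` library. Because the public API has no block reorder or move call, this naive swap re-appends shallow copies of both sections at the end of the page in the desired order and archives the originals; the token and page ID are placeholders, pagination and nested children are ignored, and keeping the sections at their original position would additionally need the append endpoint's `after` option.
```python
from notion_client import Client

notion = Client(auth="<NOTION_TOKEN>")  # placeholder token
PAGE_ID = "<standard-operating-procedure-page-id>"  # hypothetical

blocks = notion.blocks.children.list(block_id=PAGE_ID)["results"]

def heading_text(block: dict) -> str:
    if block["type"] != "heading_2":
        return ""
    rich = block["heading_2"].get("rich_text", [])
    return rich[0]["plain_text"] if rich else ""

def section_slice(title: str) -> slice:
    """A section spans its heading_2 up to (not including) the next heading_2."""
    start = next(i for i, b in enumerate(blocks) if heading_text(b) == title)
    end = next((i for i in range(start + 1, len(blocks))
                if blocks[i]["type"] == "heading_2"), len(blocks))
    return slice(start, end)

roles = blocks[section_slice("Roles & responsibilities")]
terms = blocks[section_slice("Terminologies")]

def shallow_copy(block: dict) -> dict:
    # Keep only the type payload; read-only fields (id, timestamps) are dropped, and
    # children nested under the copied blocks are not carried over by this sketch.
    return {"type": block["type"], block["type"]: block[block["type"]]}

# Re-append the two sections with Roles first, then archive the original blocks.
notion.blocks.children.append(
    block_id=PAGE_ID,
    children=[shallow_copy(b) for b in roles + terms],
)
for old in roles + terms:
    notion.blocks.delete(block_id=old["id"])
```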
================================================
FILE: tasks/notion/easy/standard_operating_procedure/simple__section_organization/meta.json
================================================
{
"task_id": "simple__section_organization",
"task_name": "Simple Section Organization",
"category_id": "standard_operating_procedure",
"category_name": "Standard Operating Procedure",
"description": "Reorganize the Standard Operating Procedure page by swapping sections and creating a column layout.",
"author": "Xiangyan Liu",
"created_at": "2025-11-15",
"difficulty": "L1",
"tags": [
"content organization",
"cross-reference linking",
"visual formatting"
],
"mcp": [
"notion"
],
"meta_data": {
"stateType": "url",
"stateContent": null,
"stateUrl": "https://painted-tennis-ebc.notion.site/Standard-Operating-Procedure-24381626b6d780a8b678f9e62ae5b152",
"stateOriginalUrl": "https://www.notion.so/marketplace/templates/standard-operating-procedure"
}
}
================================================
FILE: tasks/notion/easy/standard_operating_procedure/simple__section_organization/verify.py
================================================
import sys
from notion_client import Client
from tasks.utils import notion_utils
TARGET_PAGE_TITLE = "Standard Operating Procedure"
ROLES_HEADING = "Roles & responsibilities"
TERMINOLOGIES_HEADING = "Terminologies"
def _find_heading_indices(blocks: list[dict]) -> tuple[int | None, int | None]:
"""Return the indices of the target headings within the flattened block list."""
roles_index = None
terminologies_index = None
for index, block in enumerate(blocks):
if block.get("type") != "heading_2":
continue
rich_text = block.get("heading_2", {}).get("rich_text", [])
if not rich_text:
continue
heading_text = rich_text[0].get("text", {}).get("content", "")
if heading_text == ROLES_HEADING and roles_index is None:
roles_index = index
elif heading_text == TERMINOLOGIES_HEADING and terminologies_index is None:
terminologies_index = index
return roles_index, terminologies_index
def verify(notion: Client, main_id: str | None = None) -> bool:
"""Ensure the Roles & responsibilities section appears before Terminologies."""
# Resolve page id.
if main_id:
page_id, object_type = notion_utils.find_page_or_database_by_id(notion, main_id)
if not page_id or object_type != "page":
print("Error: Standard Operating Procedure page not found.", file=sys.stderr)
return False
else:
page_id = notion_utils.find_page(notion, TARGET_PAGE_TITLE)
if not page_id:
print("Error: Standard Operating Procedure page not found.", file=sys.stderr)
return False
# Fetch all blocks (flattened order from top to bottom).
blocks = notion_utils.get_all_blocks_recursively(notion, page_id)
roles_index, terminologies_index = _find_heading_indices(blocks)
if roles_index is None:
print("Error: 'Roles & responsibilities' section not found.", file=sys.stderr)
return False
if terminologies_index is None:
print("Error: 'Terminologies' section not found.", file=sys.stderr)
return False
if roles_index >= terminologies_index:
print(
"Error: Sections are not swapped. 'Roles & responsibilities' should appear before 'Terminologies'.",
file=sys.stderr,
)
return False
print("Success: Section order updated so 'Roles & responsibilities' precedes 'Terminologies'.")
return True
def main() -> None:
notion = notion_utils.get_notion_client()
main_id = sys.argv[1] if len(sys.argv) > 1 else None
if verify(notion, main_id):
sys.exit(0)
sys.exit(1)
if __name__ == "__main__":
main()
================================================
FILE: tasks/notion/easy/team_projects/simple__swap_tasks/description.md
================================================
Go to the Team Projects page, find the person responsible for the most tasks (10 in total) and the person responsible for the fewest tasks (3 in total), then swap their assigned tasks.
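As a rough illustration, the swap can be scripted against the Tasks database with `notion_client`. The sketch below assumes a placeholder token and database ID, omits pagination, and uses the same property names ("Assigned", first assignee only) that the verifier below relies on.
```python
from collections import defaultdict
from notion_client import Client

notion = Client(auth="<NOTION_TOKEN>")  # placeholder token
TASKS_DB_ID = "<tasks-database-id>"     # hypothetical: the Tasks child database on the page

# Group task pages by their first assignee (single query; pagination omitted).
tasks = notion.databases.query(database_id=TASKS_DB_ID, page_size=100)["results"]
by_person: dict[str, list[dict]] = defaultdict(list)
for task in tasks:
    people = task["properties"]["Assigned"]["people"]
    if people:
        by_person[people[0]["id"]].append(task)

# Identify the busiest and least busy assignees, then hand each one's tasks to the other.
most_id = max(by_person, key=lambda uid: len(by_person[uid]))
least_id = min(by_person, key=lambda uid: len(by_person[uid]))
for task in by_person[most_id]:
    notion.pages.update(page_id=task["id"],
                        properties={"Assigned": {"people": [{"id": least_id}]}})
for task in by_person[least_id]:
    notion.pages.update(page_id=task["id"],
                        properties={"Assigned": {"people": [{"id": most_id}]}})
```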
================================================
FILE: tasks/notion/easy/team_projects/simple__swap_tasks/meta.json
================================================
{
"task_id": "simple__swap_tasks",
"task_name": "Simple Swap Tasks",
"category_id": "team_projects",
"category_name": "Team Projects",
"description": "Find the person responsible for the most and fewest tasks, then swap their assigned tasks.",
"author": "Xiangyan Liu",
"created_at": "2025-11-15",
"difficulty": "L1",
"tags": [
"data aggregation",
"automated migration",
"conditional filtering"
],
"mcp": [
"notion"
],
"meta_data": {
"stateType": "url",
"stateContent": null,
"stateUrl": "https://painted-tennis-ebc.notion.site/Team-Projects-24e81626b6d7809c982fdb7a25825898",
"stateOriginalUrl": "https://www.notion.so/marketplace/templates/gantt-chart"
}
}
================================================
FILE: tasks/notion/easy/team_projects/simple__swap_tasks/verify.py
================================================
import sys
from notion_client import Client
from tasks.utils import notion_utils
def verify(notion: Client, main_id: str = None) -> bool:
"""
Verifies that the task assignees have been swapped correctly.
Checks:
1. "Develop a plan for promotion" and "Evaluate different third-party services" have swapped assignees
2. The person with most tasks and person with least tasks have swapped all their tasks
"""
# Step 1: Find the Team Projects page
if main_id:
found_id, object_type = notion_utils.find_page_or_database_by_id(notion, main_id)
if not found_id or object_type != 'page':
print("Error: Team Projects page not found.", file=sys.stderr)
return False
else:
# Try to find the page by searching
found_id = notion_utils.find_page(notion, "Team Projects")
if not found_id:
print("Error: Team Projects page not found.", file=sys.stderr)
return False
# Get all blocks from the page to find database references
all_blocks = notion_utils.get_all_blocks_recursively(notion, found_id)
# Find Tasks database ID from the page
tasks_db_id = None
for block in all_blocks:
if block and block.get("type") == "child_database":
db_title = block.get("child_database", {}).get("title", "")
if "Tasks" in db_title:
tasks_db_id = block["id"]
break
if not tasks_db_id:
print("Error: Tasks database not found.", file=sys.stderr)
return False
print("\n📋 Starting verification...")
# Step 2: Query all tasks to analyze assignees
try:
all_tasks_response = notion.databases.query(
database_id=tasks_db_id,
page_size=100
)
if not all_tasks_response.get("results"):
print("Error: No tasks found in Tasks database.", file=sys.stderr)
return False
tasks = all_tasks_response["results"]
except Exception as e:
print(f"Error querying Tasks database: {e}", file=sys.stderr)
return False
# Step 3: Check specific tasks have swapped assignees
develop_plan_task = None
evaluate_services_task = None
for task in tasks:
task_name = task["properties"]["Name"]["title"][0]["text"]["content"]
if task_name == "Develop a plan for promotion":
develop_plan_task = task
elif task_name == "Evaluate different third-party services":
evaluate_services_task = task
if not develop_plan_task or not evaluate_services_task:
print("Error: Could not find both required tasks.", file=sys.stderr)
return False
# Get assignees for these tasks
develop_plan_assignees = develop_plan_task["properties"]["Assigned"]["people"]
evaluate_services_assignees = evaluate_services_task["properties"]["Assigned"]["people"]
if not develop_plan_assignees or not evaluate_services_assignees:
print("Error: Tasks don't have assignees.", file=sys.stderr)
return False
develop_plan_assignee_id = develop_plan_assignees[0]["id"]
evaluate_services_assignee_id = evaluate_services_assignees[0]["id"]
# These should be different (swapped)
if develop_plan_assignee_id == evaluate_services_assignee_id:
print("Error: Tasks should have different assignees after swap.", file=sys.stderr)
return False
# Step 4: Count tasks per person
task_counts = {}
unassigned_count = 0
for task in tasks:
assignees = task["properties"]["Assigned"]["people"]
if assignees:
assignee_id = assignees[0]["id"]
if assignee_id not in task_counts:
task_counts[assignee_id] = []
task_counts[assignee_id].append(task["properties"]["Name"]["title"][0]["text"]["content"])
else:
unassigned_count += 1
# Sort by task count
sorted_assignees = sorted(task_counts.items(), key=lambda x: len(x[1]))
if len(sorted_assignees) < 2:
print("Error: Need at least 2 people with tasks to verify swap.", file=sys.stderr)
return False
# Get person with least and most tasks
person_with_least = sorted_assignees[0]
person_with_most = sorted_assignees[-1]
least_id, least_tasks = person_with_least
most_id, most_tasks = person_with_most
# Step 5: Verify the swap pattern
# Original distribution (before swap):
# - 5ac96c02-49a4-4320-8de6-b663ba83126b had 3 tasks (least)
# - ac7a3bd0-c111-4464-8f45-8a857a1abc8a had 10 tasks (most)
# After complete swap, we expect:
# - 5ac96c02-49a4-4320-8de6-b663ba83126b should have 10 tasks
# - ac7a3bd0-c111-4464-8f45-8a857a1abc8a should have 3 tasks
original_least_id = "5ac96c02-49a4-4320-8de6-b663ba83126b"
original_most_id = "ac7a3bd0-c111-4464-8f45-8a857a1abc8a"
# Check if the swap has been completed
swap_completed = False
for assignee_id, assignee_tasks in task_counts.items():
if assignee_id == original_least_id and len(assignee_tasks) == 10:
# Person who had 3 now has 10
for other_id, other_tasks in task_counts.items():
if other_id == original_most_id and len(other_tasks) == 3:
# Person who had 10 now has 3
swap_completed = True
break
# Step 6: Summary
print(f"\n📊 Task Distribution:")
print(f" • Total tasks: {len(tasks)}")
print(f" • Assigned tasks: {len(tasks) - unassigned_count}")
print(f" • Unassigned tasks: {unassigned_count}")
print(f" • People with tasks: {len(task_counts)}")
print(f"\n Task counts by person:")
for assignee_id, assignee_tasks in sorted_assignees:
print(f" - {assignee_id[:8]}...: {len(assignee_tasks)} tasks")
# Step 7: Final verification
print("\n🔍 Verification Results:")
# Check that the swap has created a significant difference
if len(most_tasks) - len(least_tasks) < 5:
print(f"Warning: Difference between most and least is only {len(most_tasks) - len(least_tasks)} tasks", file=sys.stderr)
# Verify specific expected outcomes
verification_passed = True
# Check 1: Specific tasks have been swapped
specific_tasks_swapped = develop_plan_assignee_id != evaluate_services_assignee_id
if specific_tasks_swapped:
print(" ✓ Specific tasks have been swapped")
else:
print(" ✗ Specific tasks were not swapped", file=sys.stderr)
verification_passed = False
# Check 2: Task distribution shows a complete swap
if swap_completed:
print(" ✓ Complete task swap verified (3↔10 tasks)")
else:
# Show actual distribution for debugging
person1_tasks = len(task_counts.get(original_least_id, []))
person2_tasks = len(task_counts.get(original_most_id, []))
print(f" ✗ Swap incomplete! Expected 5ac96c02→10 tasks, ac7a3bd0→3 tasks", file=sys.stderr)
print(f" Actual: 5ac96c02→{person1_tasks} tasks, ac7a3bd0→{person2_tasks} tasks", file=sys.stderr)
verification_passed = False
# Check 3: Total task count is preserved
total_assigned_tasks = sum(len(assignee_tasks) for assignee_tasks in task_counts.values())
expected_total = len(tasks) - unassigned_count
if total_assigned_tasks == expected_total:
print(f" ✓ Total task count preserved ({total_assigned_tasks} assigned)")
else:
print(f" ✗ Task count mismatch: {total_assigned_tasks} vs {expected_total} expected", file=sys.stderr)
verification_passed = False
if verification_passed:
print("\n✅ All verification checks passed!")
return True
else:
print("\n❌ Verification failed", file=sys.stderr)
return False
def main():
"""
Executes the verification process and exits with a status code.
"""
notion = notion_utils.get_notion_client()
main_id = sys.argv[1] if len(sys.argv) > 1 else None
if verify(notion, main_id):
sys.exit(0)
else:
sys.exit(1)
if __name__ == "__main__":
main()
================================================
FILE: tasks/notion/easy/toronto_guide/simple__change_color/description.md
================================================
Open the **Toronto Guide** page and refresh the colors of the tags in the **Food** database.
## Requirements
1. Find and open the Toronto Guide page in Notion.
2. Locate the *Food* database on that page.
3. Update every tag in the Food database that is currently pink so that it uses a different color of your choice (any non-pink color is fine).
4. Do not modify callouts or tags in the other databases.
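A hedged sketch of how the recoloring could be scripted: it assumes the Food database exposes a multi-select **Tags** property and that the database update endpoint accepts re-submitting the existing options with a new `color` (keeping each option's `id` so the existing option is updated rather than duplicated). The token and database ID are placeholders.
```python
from notion_client import Client

notion = Client(auth="<NOTION_TOKEN>")  # placeholder token
FOOD_DB_ID = "<food-database-id>"       # hypothetical: the Food child database

# Read the current options on the multi-select Tags property.
db = notion.databases.retrieve(database_id=FOOD_DB_ID)
options = db["properties"]["Tags"]["multi_select"]["options"]

# Re-submit every option, recoloring only the pink ones.
updated = [
    {"id": opt["id"], "name": opt["name"],
     "color": "blue" if opt.get("color") == "pink" else opt.get("color")}
    for opt in options
]
notion.databases.update(
    database_id=FOOD_DB_ID,
    properties={"Tags": {"multi_select": {"options": updated}}},
)
```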
================================================
FILE: tasks/notion/easy/toronto_guide/simple__change_color/meta.json
================================================
{
"task_id": "simple__change_color",
"task_name": "Simple Change Color",
"category_id": "toronto_guide",
"category_name": "Toronto Guide",
"description": "Navigate to the Toronto Guide page and change all pink-colored elements to different colors.",
"author": "Xiangyan Liu",
"created_at": "2025-11-15",
"difficulty": "L1",
"tags": [
"visual formatting",
"conditional filtering"
],
"mcp": [
"notion"
],
"meta_data": {
"stateType": "url",
"stateContent": null,
"stateUrl": "https://painted-tennis-ebc.notion.site/Toronto-Guide-25281626b6d7802caa7cc394647e901c",
"stateOriginalUrl": "https://www.notion.so/marketplace/templates/conquering-toronto-a-destination-guide"
}
}
================================================
FILE: tasks/notion/easy/toronto_guide/simple__change_color/verify.py
================================================
import sys
from notion_client import Client
from tasks.utils import notion_utils
TARGET_PAGE_TITLE = "Toronto Guide"
FOOD_DATABASE_KEYWORD = "Food"
TARGET_TAG_NAMES = [
"Middle Eastern",
"Jamaican",
"Indian",
]
def _get_food_database_id(notion: Client, page_id: str) -> str | None:
"""Return the block ID of the Food database shown on the target page."""
all_blocks = notion_utils.get_all_blocks_recursively(notion, page_id)
for block in all_blocks:
if not block or block.get("type") != "child_database":
continue
title = block.get("child_database", {}).get("title", "")
if FOOD_DATABASE_KEYWORD.lower() in title.lower():
return block.get("id")
return None
def verify(notion: Client, main_id: str | None = None) -> bool:
"""Check that all target tags in the Food database are no longer pink."""
# Resolve the Toronto Guide page ID.
if main_id:
found_id, object_type = notion_utils.find_page_or_database_by_id(notion, main_id)
if not found_id or object_type != "page":
print("Error: Toronto Guide page not found.", file=sys.stderr)
return False
page_id = found_id
else:
page_id = notion_utils.find_page(notion, TARGET_PAGE_TITLE)
if not page_id:
print("Error: Toronto Guide page not found.", file=sys.stderr)
return False
# Locate the Food database block.
food_db_id = _get_food_database_id(notion, page_id)
if not food_db_id:
print("Error: Food database not found on the Toronto Guide page.", file=sys.stderr)
return False
# Fetch database definition and inspect tag options.
try:
db_info = notion.databases.retrieve(database_id=food_db_id)
except Exception as exc:
print(f"Error: Unable to retrieve Food database ({exc}).", file=sys.stderr)
return False
tags_property = db_info.get("properties", {}).get("Tags", {})
if tags_property.get("type") != "multi_select":
print("Error: Food database does not have a multi-select Tags property.", file=sys.stderr)
return False
options = tags_property.get("multi_select", {}).get("options", [])
remaining_targets = set(TARGET_TAG_NAMES)
failures = False
for option in options:
tag_name = option.get("name", "").strip()
if tag_name not in remaining_targets:
continue
remaining_targets.discard(tag_name)
color = option.get("color")
if color == "pink":
print(f"Error: Tag '{tag_name}' in Food database is still pink.", file=sys.stderr)
failures = True
else:
print(f"✓ Tag '{tag_name}' color updated to '{color}'.")
if remaining_targets:
print(
f"Error: Food tags not found (expected to exist): {sorted(remaining_targets)}.",
file=sys.stderr,
)
return False
if failures:
return False
print("Success: All Food database tags are now non-pink.")
return True
def main() -> None:
notion = notion_utils.get_notion_client()
main_id = sys.argv[1] if len(sys.argv) > 1 else None
if verify(notion, main_id):
sys.exit(0)
sys.exit(1)
if __name__ == "__main__":
main()
================================================
FILE: tasks/notion/standard/company_in_a_box/employee_onboarding/description.md
================================================
Build an integrated **Employee Onboarding** system for the existing **Company In A Box** page.
**Task Requirements:**
1. Create a new **database** titled **Employee Onboarding Checklist** with the following properties *exactly*:
• **Employee Name** – title
• **Start Date** – date
• **Department** – select (options: Product, Marketing, Sales, HR, Engineering)
Populate this database with **3** sample new-hire pages covering three different departments. Every property in each entry must be filled.
2. Under the top-level page **Company In A Box**, create a new child page titled **Onboarding Hub** containing, in order:
1) The **Employee Onboarding Checklist** database embedded at the top.
2) A section headed **Benefits Overview** that includes linked mentions (@-mentions or link-to-page blocks) to **≥ 3** distinct benefit-policy pages from the **Company Wiki** (for example *Benefits policy*, *Vacation Policy*, *Corporate travel*).
3) A section headed **30-Day Timeline** that presents a numbered list with **7** steps covering the first 30 days. **Each step must reference (via @-mention) an existing page or database**.
4) A section headed **Feedback Form** that provides **≥ 3** to-do items for new hires to check off.
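For orientation, the checklist database from requirement 1 could be created roughly as follows with `notion_client`; the token and parent page ID are placeholders, only one sample new-hire row is shown, and the Onboarding Hub page and its sections would be built with similar `pages.create` / `blocks.children.append` calls.
```python
from notion_client import Client

notion = Client(auth="<NOTION_TOKEN>")          # placeholder token
COMPANY_PAGE_ID = "<company-in-a-box-page-id>"  # hypothetical parent page

# Create the checklist database with exactly the three required properties.
db = notion.databases.create(
    parent={"type": "page_id", "page_id": COMPANY_PAGE_ID},
    title=[{"type": "text", "text": {"content": "Employee Onboarding Checklist"}}],
    properties={
        "Employee Name": {"title": {}},
        "Start Date": {"date": {}},
        "Department": {"select": {"options": [
            {"name": name} for name in ["Product", "Marketing", "Sales", "HR", "Engineering"]
        ]}},
    },
)

# Add one sample new-hire entry; the other two follow the same shape.
notion.pages.create(
    parent={"database_id": db["id"]},
    properties={
        "Employee Name": {"title": [{"text": {"content": "Avery Chen"}}]},
        "Start Date": {"date": {"start": "2025-01-06"}},
        "Department": {"select": {"name": "Engineering"}},
    },
)
```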
================================================
FILE: tasks/notion/standard/company_in_a_box/employee_onboarding/meta.json
================================================
{
"task_id": "employee_onboarding",
"task_name": "Employee Onboarding",
"category_id": "company_in_a_box",
"category_name": "Company In A Box",
"description": "Build an integrated Employee Onboarding system for the existing Company In A Box page with a checklist database, onboarding hub, and feedback form.",
"author": "Zijian Wu",
"created_at": "2025-07-27",
"difficulty": "L3",
"tags": [
"database manipulation",
"template population",
"cross-reference linking",
"status tracking"
],
"mcp": [
"notion"
],
"meta_data": {
"stateType": "url",
"stateContent": null,
"stateUrl": "https://painted-tennis-ebc.notion.site/Company-In-A-Box-23d81626b6d7800098f3d0e64a706cd8",
"stateOriginalUrl": "https://www.notion.so/marketplace/templates/company-in-a-box"
}
}
================================================
FILE: tasks/notion/standard/company_in_a_box/employee_onboarding/verify.py
================================================
import sys
from typing import Dict, Set
from notion_client import Client
from tasks.utils import notion_utils
def _check_db_schema(db_props: Dict[str, Dict], required: Dict[str, str]) -> bool:
"""Return True if every required property exists with the correct type."""
for prop_name, expected_type in required.items():
if prop_name not in db_props:
print(
f"Error: Property '{prop_name}' missing from database.", file=sys.stderr
)
return False
actual_type = db_props[prop_name]["type"]
if actual_type != expected_type:
print(
f"Error: Property '{prop_name}' has type '{actual_type}', expected '{expected_type}'.",
file=sys.stderr,
)
return False
return True
def verify(notion: Client, main_id: str | None = None) -> bool: # noqa: C901
"""Programmatically verify the onboarding system described in description.md."""
DB_TITLE = "Employee Onboarding Checklist"
HUB_PAGE_TITLE = "Onboarding Hub"
DEPARTMENT_OPTIONS: Set[str] = {
"Product",
"Marketing",
"Sales",
"HR",
"Engineering",
}
REQUIRED_DB_PROPERTIES = {
"Employee Name": "title",
"Start Date": "date",
"Department": "select",
}
# 1. Locate onboarding database
db_id = notion_utils.find_database(notion, DB_TITLE)
if not db_id:
print(f"Error: Database '{DB_TITLE}' not found.", file=sys.stderr)
return False
try:
db_obj = notion.databases.retrieve(database_id=db_id)
except Exception as exc:
print(f"Error retrieving database: {exc}", file=sys.stderr)
return False
db_props = db_obj.get("properties", {})
if not _check_db_schema(db_props, REQUIRED_DB_PROPERTIES):
return False
# Extra: validate select options
dept_options = {opt["name"] for opt in db_props["Department"]["select"]["options"]}
if not DEPARTMENT_OPTIONS.issubset(dept_options):
print(
f"Error: Department select options must include {sorted(DEPARTMENT_OPTIONS)}. Current: {sorted(dept_options)}",
file=sys.stderr,
)
return False
# Check there are at least 3 entries in the database
try:
db_pages = notion.databases.query(database_id=db_id).get("results", [])
except Exception as exc:
print(f"Error querying database: {exc}", file=sys.stderr)
return False
if len(db_pages) < 3:
print(
"Error: Less than 3 onboarding entries found in the database.",
file=sys.stderr,
)
return False
# 2. Locate Onboarding Hub page
hub_page_id = notion_utils.find_page(notion, HUB_PAGE_TITLE)
if not hub_page_id:
print(f"Error: Page '{HUB_PAGE_TITLE}' not found.", file=sys.stderr)
return False
# 3. Ensure the onboarding database is embedded in the hub page
embedded_db_id = notion_utils.find_database_in_block(notion, hub_page_id, DB_TITLE)
if embedded_db_id != db_id:
print(
"Error: The Employee Onboarding Checklist database is not embedded in the Onboarding Hub page.",
file=sys.stderr,
)
return False
# 4. Analyse blocks within the hub page for linked mentions, timeline, and feedback form
all_blocks = notion_utils.get_all_blocks_recursively(notion, hub_page_id)
seen_link_targets: Set[str] = set()
numbered_list_count = 0
todo_count = 0
for blk in all_blocks:
blk_type = blk.get("type")
# Direct link-to-page blocks
if blk_type == "link_to_page":
info = blk.get("link_to_page", {})
target_id = info.get("page_id") or info.get("database_id")
if target_id:
seen_link_targets.add(target_id)
continue
# Rich-text mentions inside content blocks
if blk_type in {
"paragraph",
"numbered_list_item",
"bulleted_list_item",
"to_do",
}:
content = blk.get(blk_type, {})
for rt in content.get("rich_text", []):
if rt.get("type") == "mention":
mention = rt.get("mention", {})
if mention.get("type") in {"page", "database"}:
target_id = mention.get("page", {}).get("id") or mention.get(
"database", {}
).get("id")
if target_id:
seen_link_targets.add(target_id)
# Count numbered list items
if blk_type == "numbered_list_item":
numbered_list_count += 1
# Count to-do items in Feedback Form
if blk_type == "to_do":
todo_count += 1
if len(seen_link_targets) < 3:
print(
"Error: Fewer than 3 linked mentions to benefit policy pages found in the Benefits Overview section.",
file=sys.stderr,
)
return False
if numbered_list_count < 7:
print(
"Error: Numbered list contains fewer than 7 steps in the 30-Day Timeline section.",
file=sys.stderr,
)
return False
if todo_count < 3:
print(
"Error: Feedback Form section contains fewer than 3 to-do items.",
file=sys.stderr,
)
return False
print(
"Success: Verified Employee Onboarding Checklist database, Onboarding Hub page, and all required sections."
)
return True
def main():
notion = notion_utils.get_notion_client()
main_id = sys.argv[1] if len(sys.argv) > 1 else None
if verify(notion, main_id):
sys.exit(0)
else:
sys.exit(1)
if __name__ == "__main__":
main()
================================================
FILE: tasks/notion/standard/company_in_a_box/goals_restructure/description.md
================================================
Please restructure the **Current Goals** section on my **Company In A Box** page as follows:
1. **Add a new goal heading** — create a new `heading_3` block titled:
`🔄 Digital Transformation Initiative`
2. **Convert all four goal headings to toggles** — the three existing goals plus the new heading from step 1:
* ⚙️ Expand Operations to LATAM
* 🛠️ Push for Enterprise
* 🩶 Boost Employee Engagement
* 🔄 Digital Transformation Initiative
3. **Move descriptions inside the toggles** — every paragraph or list that originally sat directly under a goal heading should become a **child block** of that heading after it is made toggleable.
4. **Preserve content & order** — apart from the changes above, do **not** modify the text, formatting, or order of existing goal descriptions.
The end result should be a clean **Current Goals** section containing four toggleable goal headings, each with its corresponding details tucked inside.
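A cautious sketch of one way to do this programmatically: it is not certain that `is_toggleable` can be flipped on an existing heading through the public update endpoint, so this helper recreates a goal heading as a toggleable `heading_3` with its description blocks nested inside and then archives the originals. The token is a placeholder, the recreated heading lands at the end of its parent unless the append endpoint's `after` option is used, and the shallow copies ignore nested children and may need read-only fields stripped.
```python
from notion_client import Client

notion = Client(auth="<NOTION_TOKEN>")  # placeholder token

def make_goal_toggleable(parent_id: str, heading_block: dict,
                         description_blocks: list[dict]) -> None:
    """Recreate a goal heading as a toggleable heading_3 holding its description."""
    # Rebuild the heading text from plain text (drops inline formatting, which the
    # emoji-prefixed goal headings here do not use).
    title = "".join(rt.get("plain_text", "")
                    for rt in heading_block["heading_3"]["rich_text"])
    children = [
        # Shallow copies; read-only fields inside the payloads may need stripping.
        {"type": b["type"], b["type"]: b[b["type"]]}
        for b in description_blocks
    ]
    notion.blocks.children.append(
        block_id=parent_id,
        children=[{
            "type": "heading_3",
            "heading_3": {
                "rich_text": [{"type": "text", "text": {"content": title}}],
                "is_toggleable": True,
                "children": children,
            },
        }],
    )
    # Archive the originals once the toggleable copy exists.
    for old in [heading_block, *description_blocks]:
        notion.blocks.delete(block_id=old["id"])
```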
================================================
FILE: tasks/notion/standard/company_in_a_box/goals_restructure/meta.json
================================================
{
"task_id": "goals_restructure",
"task_name": "Goals Restructure",
"category_id": "company_in_a_box",
"category_name": "Company In A Box",
"description": "Restructure the Current Goals section on the Company In A Box page by adding a new goal heading and converting all goal headings to toggles with content inside.",
"author": "Zijian Wu",
"created_at": "2025-07-27",
"difficulty": "L3",
"tags": [
"content organization",
"visual formatting"
],
"mcp": [
"notion"
],
"meta_data": {
"stateType": "url",
"stateContent": null,
"stateUrl": "https://painted-tennis-ebc.notion.site/Company-In-A-Box-23d81626b6d7800098f3d0e64a706cd8",
"stateOriginalUrl": "https://www.notion.so/marketplace/templates/company-in-a-box"
}
}
================================================
FILE: tasks/notion/standard/company_in_a_box/goals_restructure/verify.py
================================================
import sys
from typing import List
from notion_client import Client
from tasks.utils import notion_utils
# Expected new goal heading text (including emoji)
NEW_GOAL_HEADING = "🔄 Digital Transformation Initiative"
# Section title to look for
GOALS_SECTION_TITLE = "Current Goals"
def _plain(block) -> str:
"""Return concatenated plain text of a block."""
return notion_utils.get_block_plain_text(block)
# Some Notion rich-text strings may include non-breaking spaces (\xa0) after emoji.
# Normalize them to plain spaces so text matching is robust.
def _normalize_string(s: str) -> str:
return s.replace("\xa0", " ")
def _is_heading(block) -> bool:
return block.get("type") in ["heading_1", "heading_2", "heading_3"]
def _is_toggle(block) -> bool:
"""Determine whether a block is a toggle (standard toggle block or toggle-able heading)."""
btype = block.get("type")
# In our scenario, goal blocks are headings (usually heading_3) marked as toggleable.
if btype in ["heading_1", "heading_2", "heading_3"]:
heading_data = block.get(btype, {})
return heading_data.get("is_toggleable", False)
# Some Notion pages may contain classic toggle blocks (type == "toggle"). They are
# not expected in this task, but keeping this check allows broader compatibility.
return btype == "toggle"
def _get_children(notion: Client, block_id: str) -> List[dict]:
"""Retrieve **direct** children of a block (no pagination handling needed for small test pages)."""
try:
return notion.blocks.children.list(block_id=block_id).get("results", [])
except Exception:
return []
def verify(notion: Client, main_id: str = None) -> bool:
"""Verifies that the Company in a Box page has been updated per the task requirements."""
# 1. Locate the main page
page_id = None
if main_id:
found_id, obj_type = notion_utils.find_page_or_database_by_id(notion, main_id)
if found_id and obj_type == "page":
page_id = found_id
if not page_id:
# Try a few case variations just in case
for title in [
"Company In A Box",
]:
page_id = notion_utils.find_page(notion, title)
if page_id:
break
if not page_id:
print("Error: Could not find the 'Company in a Box' page.", file=sys.stderr)
return False
# 2. Recursively locate the "Current Goals" heading and collect its sibling blocks that
# constitute the section.
def _fetch_children(bid: str) -> List[dict]:
try:
return notion.blocks.children.list(block_id=bid).get("results", [])
except Exception:
return []
goals_section_blocks: List[dict] = []
# Breadth-first traversal to find the heading
queue = [page_id]
found_parent = None
found_index = None
while queue and found_parent is None:
parent_id = queue.pop(0)
children = _fetch_children(parent_id)
for idx, child in enumerate(children):
if (
_is_heading(child)
and GOALS_SECTION_TITLE.lower()
in _normalize_string(_plain(child)).lower()
):
found_parent = parent_id
found_index = idx
break
# enqueue grandchildren for further search
for ch in children:
if ch.get("has_children"):
queue.append(ch["id"])
if found_parent is None:
print(
"Error: Could not find the 'Current Goals' heading anywhere in the page.",
file=sys.stderr,
)
return False
# Retrieve siblings once more to get the final list and slice after heading.
siblings = _fetch_children(found_parent)
if found_index is None or found_index >= len(siblings):
print(
"Error: Internal logic issue when locating Current Goals section.",
file=sys.stderr,
)
return False
goals_section_blocks = siblings[found_index + 1 :]
if not goals_section_blocks:
print("Error: 'Current Goals' section appears to be empty.", file=sys.stderr)
return False
# 3. Identify toggle blocks that represent goals
toggle_blocks = [b for b in goals_section_blocks if _is_toggle(b)]
if len(toggle_blocks) != 4:
print(
f"Error: Expected 4 toggle blocks for goals, found {len(toggle_blocks)}.",
file=sys.stderr,
)
return False
# 4. Ensure the new goal heading exists among the toggles
found_new_goal = False
for tb in toggle_blocks:
if (
_normalize_string(NEW_GOAL_HEADING).lower()
in _normalize_string(_plain(tb)).lower()
):
found_new_goal = True
break
if not found_new_goal:
print(
f"Error: Did not find a toggle block with heading '{NEW_GOAL_HEADING}'.",
file=sys.stderr,
)
return False
# 5. Validate that each toggle has at least one child paragraph/description
for tb in toggle_blocks:
if (
_normalize_string(NEW_GOAL_HEADING).lower()
in _normalize_string(_plain(tb)).lower()
):
# Skip checking the new goal itself, as it does not have a description yet.
continue
if not tb.get("has_children", False):
print(
f"Error: Toggle '{_normalize_string(_plain(tb))}' has no child blocks (description not moved).",
file=sys.stderr,
)
return False
children = _get_children(notion, tb["id"])
# Ensure there is at least one content child (paragraph, list item, etc.)
content_types = {
"paragraph",
"bulleted_list_item",
"numbered_list_item",
"to_do",
"callout",
"quote",
}
if not any(c.get("type") in content_types for c in children):
print(
f"Error: Toggle '{_normalize_string(_plain(tb))}' seems to lack any description/content inside it.",
file=sys.stderr,
)
return False
# 6. Confirm that there are **no** residual heading_3 blocks (non-toggle) for the goals
non_toggle_headings = [
b
for b in goals_section_blocks
if b.get("type") == "heading_3" and not _is_toggle(b)
]
if non_toggle_headings:
titles = [_normalize_string(_plain(b)) for b in non_toggle_headings]
print(
f"Error: Found heading_3 blocks that were not converted to toggles: {titles}.",
file=sys.stderr,
)
return False
print(
"Success: Verified goal restructuring with new toggle blocks and descriptions."
)
return True
def main():
notion = notion_utils.get_notion_client()
main_id = sys.argv[1] if len(sys.argv) > 1 else None
if verify(notion, main_id):
sys.exit(0)
else:
sys.exit(1)
if __name__ == "__main__":
main()
================================================
FILE: tasks/notion/standard/company_in_a_box/quarterly_review_dashboard/description.md
================================================
Create a quarterly business review dashboard in Notion based on the existing **Company In A Box** workspace.
**Task Requirements:**
1. Inside the **Company Wiki** page you will find a sub-page named **Company Goals**. Extract every departmental objective listed under the four departments — **Product**, **Marketing**, **Sales**, and **HR**.
2. Under the top-level page **Company In A Box**, create a new child page titled **Q4 2024 Business Review Dashboard**.
3. Inside that new page build the following structure (all parts must exist):
1. A single **callout** block near the top that summarises progress toward the three *Current Goals* shown on the main page:
• *LATAM expansion* • *Enterprise push* • *Employee engagement*
(All three phrases must appear in the callout text.)
2. Four separate **section headings** (any heading level) – one for each department (**Product**, **Marketing**, **Sales**, **Human Resources**) – placed below the callout. Under each heading list that department’s objectives in a progress-tracking format (e.g. to-dos, check-box list). Each objective from the **Company Goals** page must appear at least once.
3. Add a **database** named **Action Items** with the following properties *exactly*:
• **Task Name** – title
• **Department** – select (options: Product, Marketing, Sales, HR)
• **Priority** – select (options: High, Medium, Low)
• **Status** – status
Populate this database with **≥ 5** action-item pages derived from the departmental objectives, making sure every field in each entry is filled:
• **Task Name** & **Department** must correctly correspond to the underlying objective/department.
• **Priority** and **Status** can be any allowed value, but they must **not** be left empty.
4. Keep the overall visual style consistent with the existing wiki (use headings, dividers, etc.).
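As a starting point, the sketch below shows how the dashboard page, the summary callout, and the Action Items schema could be created with `notion_client`. It is illustrative only: the token and parent page ID are placeholders, the callout text is sample wording rather than the real goal summary, extracting objectives from Company Goals and building the per-department to-do sections would follow the same block-append pattern, and the required **Status** property is omitted because the public API may not allow creating status properties (add it in the UI if the request rejects it).
```python
from notion_client import Client

notion = Client(auth="<NOTION_TOKEN>")          # placeholder token
COMPANY_PAGE_ID = "<company-in-a-box-page-id>"  # hypothetical parent page

# Create the dashboard page with the progress callout as its first block.
dashboard = notion.pages.create(
    parent={"page_id": COMPANY_PAGE_ID},
    properties={"title": {"title": [{"type": "text",
        "text": {"content": "Q4 2024 Business Review Dashboard"}}]}},
    children=[{
        "type": "callout",
        "callout": {
            "icon": {"type": "emoji", "emoji": "📊"},
            "rich_text": [{"type": "text", "text": {"content":
                "Progress summary: LATAM expansion on track, Enterprise push underway, "
                "Employee engagement improving."}}],
        },
    }],
)

# Create the Action Items database under the new page (Status property omitted, see above).
notion.databases.create(
    parent={"type": "page_id", "page_id": dashboard["id"]},
    title=[{"type": "text", "text": {"content": "Action Items"}}],
    properties={
        "Task Name": {"title": {}},
        "Department": {"select": {"options": [
            {"name": d} for d in ["Product", "Marketing", "Sales", "HR"]]}},
        "Priority": {"select": {"options": [
            {"name": p} for p in ["High", "Medium", "Low"]]}},
    },
)
```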
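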
================================================
FILE: tasks/notion/standard/company_in_a_box/quarterly_review_dashboard/meta.json
================================================
{
"task_id": "quarterly_review_dashboard",
"task_name": "Quarterly Review Dashboard",
"category_id": "company_in_a_box",
"category_name": "Company In A Box",
"description": "Create a quarterly business review dashboard in Notion based on the existing Company In A Box workspace with department objectives and action items database.",
"author": "Zijian Wu",
"created_at": "2025-07-27",
"difficulty": "L3",
"tags": [
"database manipulation",
"data aggregation",
"report generation",
"status tracking",
"template population"
],
"mcp": [
"notion"
],
"meta_data": {
"stateType": "url",
"stateContent": null,
"stateUrl": "https://painted-tennis-ebc.notion.site/Company-In-A-Box-23d81626b6d7800098f3d0e64a706cd8",
"stateOriginalUrl": "https://www.notion.so/marketplace/templates/company-in-a-box"
}
}
================================================
FILE: tasks/notion/standard/company_in_a_box/quarterly_review_dashboard/verify.py
================================================
import sys
from typing import List
from notion_client import Client
from tasks.utils import notion_utils
def _contains_keywords(text: str, keywords: List[str]) -> bool:
lowered = text.lower()
return all(kw.lower() in lowered for kw in keywords)
def verify(notion: Client, main_id: str = None) -> bool:
"""Programmatically verify that the dashboard page and its contents meet the
requirements described in description.md.
"""
DASHBOARD_TITLE = "Q4 2024 Business Review Dashboard"
PARENT_PAGE_TITLE = "Company In A Box"
CALL_OUT_KEYWORDS = ["latam", "enterprise", "employee engagement"]
DEPARTMENTS = ["Product", "Marketing", "Sales", "Human Resources"]
REQUIRED_DB_PROPERTIES = {
"Task Name": "title",
"Department": "select",
"Priority": "select",
"Status": "status",
}
PRIORITY_OPTIONS = {"High", "Medium", "Low"}
# 1. Locate the dashboard page
page_id = None
if main_id:
found_id, obj_type = notion_utils.find_page_or_database_by_id(notion, main_id)
if found_id and obj_type == "page":
page_id = found_id
if not page_id:
page_id = notion_utils.find_page(notion, DASHBOARD_TITLE)
if not page_id:
print(f"Error: Page '{DASHBOARD_TITLE}' not found.", file=sys.stderr)
return False
# Optional: ensure it is a child of Company In A Box
try:
page_obj = notion.pages.retrieve(page_id=page_id)
parent_id = page_obj.get("parent", {}).get("page_id")
if parent_id:
parent_page = notion.pages.retrieve(page_id=parent_id)
parent_title_rt = (
parent_page.get("properties", {}).get("title", {}).get("title", [])
)
parent_title = (
parent_title_rt[0].get("plain_text") if parent_title_rt else None
)
if parent_title != PARENT_PAGE_TITLE:
print(
f"Error: Dashboard page is not a direct child of '{PARENT_PAGE_TITLE}'.",
file=sys.stderr,
)
return False
except Exception:
pass # parent check is best-effort only
# 2. Verify callout with keywords
all_blocks = notion_utils.get_all_blocks_recursively(notion, page_id)
callout_ok = False
for block in all_blocks:
if block.get("type") == "callout":
callout_text = notion_utils.get_block_plain_text(block)
if _contains_keywords(callout_text, CALL_OUT_KEYWORDS):
callout_ok = True
break
if not callout_ok:
print(
"Error: No callout found that includes all three Current Goal keywords (LATAM, Enterprise, Employee engagement).",
file=sys.stderr,
)
return False
# 3. Verify department section headings
found_depts = set()
for block in all_blocks:
if block.get("type") in {"heading_1", "heading_2", "heading_3"}:
heading_text = notion_utils.get_block_plain_text(block)
for dept in DEPARTMENTS:
if dept.lower() in heading_text.lower():
found_depts.add(dept)
if set(DEPARTMENTS) != found_depts:
missing = set(DEPARTMENTS) - found_depts
print(
f"Error: Missing department headings: {', '.join(missing)}.",
file=sys.stderr,
)
return False
# 4. Verify Action Items database exists and has correct schema
db_id = notion_utils.find_database_in_block(notion, page_id, "Action Items")
if not db_id:
print(
"Error: Database 'Action Items' not found on the dashboard.",
file=sys.stderr,
)
return False
try:
db = notion.databases.retrieve(database_id=db_id)
except Exception as exc:
print(f"Error: Unable to retrieve database: {exc}", file=sys.stderr)
return False
db_props = db.get("properties", {})
for prop_name, expected_type in REQUIRED_DB_PROPERTIES.items():
if prop_name not in db_props:
print(
f"Error: Property '{prop_name}' missing from database.", file=sys.stderr
)
return False
actual_type = db_props[prop_name]["type"]
if isinstance(expected_type, list):
if actual_type not in expected_type:
print(
f"Error: Property '{prop_name}' has type '{actual_type}', expected one of {expected_type}.",
file=sys.stderr,
)
return False
else:
if actual_type != expected_type:
print(
f"Error: Property '{prop_name}' has type '{actual_type}', expected '{expected_type}'.",
file=sys.stderr,
)
return False
# Extra check for Priority options
if prop_name == "Priority":
options = {opt["name"] for opt in db_props[prop_name]["select"]["options"]}
if not PRIORITY_OPTIONS.issubset(options):
print(
f"Error: Priority property options must include High/Medium/Low. Current options: {options}",
file=sys.stderr,
)
return False
# 5. Verify at least 5 action items exist
try:
pages = notion.databases.query(database_id=db_id).get("results", [])
except Exception as exc:
print(f"Error querying database pages: {exc}", file=sys.stderr)
return False
if len(pages) < 5:
print("Error: Database contains fewer than 5 action items.", file=sys.stderr)
return False
# Optional: Verify Department values valid
for page in pages:
props = page.get("properties", {})
# Task Name must be non-empty
title_rt = props.get("Task Name", {}).get("title", [])
task_name = title_rt[0].get("plain_text") if title_rt else ""
if not task_name.strip():
print(
f"Error: Action item '{page.get('id')}' is missing a Task Name.",
file=sys.stderr,
)
return False
# Department must be valid
dept_select = (props.get("Department", {}).get("select") or {}).get("name")
if not dept_select or dept_select not in DEPARTMENTS:
print(
f"Error: Action item '{page.get('id')}' has invalid or missing Department value.",
file=sys.stderr,
)
return False
# Priority and Status must be set (any value)
priority_val = (props.get("Priority", {}).get("select") or {}).get("name")
status_val = (props.get("Status", {}).get("status") or {}).get("name")
if not priority_val or not status_val:
print(
f"Error: Action item '{page.get('id')}' must have both Priority and Status set.",
file=sys.stderr,
)
return False
print(
"Success: Verified Business Review Dashboard, departmental sections, callout, and Action Items database with ≥5 entries."
)
return True
def main():
notion = notion_utils.get_notion_client()
main_id = sys.argv[1] if len(sys.argv) > 1 else None
if verify(notion, main_id):
sys.exit(0)
else:
sys.exit(1)
if __name__ == "__main__":
main()
================================================
FILE: tasks/notion/standard/computer_science_student_dashboard/code_snippets_go/description.md
================================================
Find the page named "Computer Science Student Dashboard" and add a new Go column to the "Code Snippets" section.
**Task Requirements:**
1. In the "Code Snippets" section, create (or locate) a column dedicated to the Go programming language. **This column must appear between the existing Python and JavaScript columns** within the same column list.
2. At the top of the Go column, add a bold paragraph that contains exactly the text `Go`.
3. Under the header paragraph, add three code-block blocks configured with `language` set to **go**:
a. **Basic Go program** – Caption must be `Basic Go program` and the code content must be exactly:
```go
package main
import "fmt"
func main() {
fmt.Println("Hello, World!")
}
```
b. **For loop in Go** – Caption must be `For loop in Go` and the code content must be exactly:
```go
for i := 0; i < 5; i++ {
fmt.Println(i)
}
```
c. **Function definition in Go** – Caption must be `Function definition in Go` and the code content must be exactly:
```go
func add(a, b int) int {
return a + b
}
```
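A minimal sketch of how the Go header and the first required code block could be appended with `notion_client`: the column's block ID is a placeholder, the other two code blocks follow the same shape, and the code string must reproduce the description above exactly, indentation included.
```python
from notion_client import Client

notion = Client(auth="<NOTION_TOKEN>")  # placeholder token
GO_COLUMN_ID = "<go-column-block-id>"   # hypothetical: the newly inserted Go column

# The code content must match the task description exactly, including indentation.
GO_HELLO = 'package main\n\nimport "fmt"\n\nfunc main() {\n    fmt.Println("Hello, World!")\n}'

notion.blocks.children.append(
    block_id=GO_COLUMN_ID,
    children=[
        # Bold "Go" header paragraph at the top of the column.
        {"type": "paragraph",
         "paragraph": {"rich_text": [{"type": "text", "text": {"content": "Go"},
                                      "annotations": {"bold": True}}]}},
        # First required code block; the other two differ only in caption and content.
        {"type": "code",
         "code": {"language": "go",
                  "rich_text": [{"type": "text", "text": {"content": GO_HELLO}}],
                  "caption": [{"type": "text", "text": {"content": "Basic Go program"}}]}},
    ],
)
```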
================================================
FILE: tasks/notion/standard/computer_science_student_dashboard/code_snippets_go/meta.json
================================================
{
"task_id": "code_snippets_go",
"task_name": "Code Snippets Go",
"category_id": "computer_science_student_dashboard",
"category_name": "Computer Science Student Dashboard",
"description": "Add a new Go column to the Code Snippets section between Python and JavaScript columns.",
"author": "Zijian Wu",
"created_at": "2025-07-27",
"difficulty": "L3",
"tags": [
"content organization",
"visual formatting",
"template population"
],
"mcp": [
"notion"
],
"meta_data": {
"stateType": "url",
"stateContent": null,
"stateUrl": "https://painted-tennis-ebc.notion.site/Computer-Science-Student-Dashboard-23e81626b6d78083b787d3c832b02ef4",
"stateOriginalUrl": "https://www.notion.so/marketplace/templates/computer-science-student-dashboard"
}
}
================================================
FILE: tasks/notion/standard/computer_science_student_dashboard/code_snippets_go/verify.py
================================================
import sys
from notion_client import Client
from tasks.utils import notion_utils
# Expected code blocks (language=go)
EXPECTED_CODE_BLOCKS = [
{
"caption": "Basic Go program",
"code": (
'package main\n\nimport "fmt"\n\nfunc main() {\n fmt.Println("Hello, World!")\n}'
),
},
{
"caption": "For loop in Go",
"code": ("for i := 0; i < 5; i++ {\n fmt.Println(i)\n}"),
},
{
"caption": "Function definition in Go",
"code": ("func add(a, b int) int {\n return a + b\n}"),
},
]
HEADER_TEXT = "Go"
def _normalize(text: str) -> str:
"""Remove trailing spaces on each line and strip leading/trailing blank lines."""
return "\n".join(line.rstrip() for line in text.strip().splitlines())
def _find_page(notion: Client, main_id: str | None) -> str | None:
"""Return a page_id to verify against or None if not found."""
page_id = None
if main_id:
found_id, object_type = notion_utils.find_page_or_database_by_id(
notion, main_id
)
if found_id and object_type == "page":
page_id = found_id
if not page_id:
page_id = notion_utils.find_page(notion, "Computer Science Student Dashboard")
return page_id
def _has_bold_header_text(block, text: str) -> bool:
"""Generic bold header/paragraph check for a given text."""
block_type = block.get("type")
if block_type not in {"paragraph", "heading_1", "heading_2", "heading_3"}:
return False
rich_text_list = block.get(block_type, {}).get("rich_text", [])
if not rich_text_list:
return False
plain = "".join(rt.get("plain_text", "") for rt in rich_text_list).strip()
if plain != text:
return False
return any(rt.get("annotations", {}).get("bold", False) for rt in rich_text_list)
def _go_column_order_correct(notion: Client, page_id: str) -> bool:
"""Return True if there exists a column list where Python → Go → JavaScript order holds."""
# Gather all blocks once (flat list) to locate column_list blocks
all_blocks = notion_utils.get_all_blocks_recursively(notion, page_id)
column_list_ids = [
blk["id"] for blk in all_blocks if blk.get("type") == "column_list"
]
for cl_id in column_list_ids:
# Retrieve columns in explicit order
columns = notion.blocks.children.list(block_id=cl_id).get("results", [])
header_to_idx: dict[str, int] = {}
for idx, col in enumerate(columns):
# Recursively inspect blocks within this column
col_blocks = notion_utils.get_all_blocks_recursively(notion, col["id"])
for blk in col_blocks:
if _has_bold_header_text(blk, "Python"):
header_to_idx.setdefault("Python", idx)
elif _has_bold_header_text(blk, "Go"):
header_to_idx.setdefault("Go", idx)
elif _has_bold_header_text(blk, "JavaScript"):
header_to_idx.setdefault("JavaScript", idx)
# Short-circuit if all three found within current traversal
if len(header_to_idx) == 3:
break
if (
"Python" in header_to_idx
and "Go" in header_to_idx
and "JavaScript" in header_to_idx
and header_to_idx["Python"]
< header_to_idx["Go"]
< header_to_idx["JavaScript"]
):
return True
return False
def _collect_code_blocks(blocks):
"""Return list of (code_content, caption) tuples for code blocks with language 'go'."""
collected = []
for block in blocks:
if block.get("type") != "code":
continue
code_data = block.get("code", {})
if code_data.get("language") != "go":
continue
code_plain = "".join(
rt.get("plain_text", "") for rt in code_data.get("rich_text", [])
)
caption_plain = "".join(
rt.get("plain_text", "") for rt in code_data.get("caption", [])
)
collected.append((code_plain, caption_plain))
return collected
def verify(notion: Client, main_id: str | None = None) -> bool:
page_id = _find_page(notion, main_id)
if not page_id:
print("Error: Target page not found.", file=sys.stderr)
return False
all_blocks = notion_utils.get_all_blocks_recursively(notion, page_id)
# Verify header
header_ok = any(_has_bold_header_text(b, HEADER_TEXT) for b in all_blocks)
if not header_ok:
print("Failure: Bold header 'Go' not found.", file=sys.stderr)
return False
# Verify code blocks
code_blocks_found = _collect_code_blocks(all_blocks)
remaining = EXPECTED_CODE_BLOCKS.copy()
for code, caption in code_blocks_found:
norm_code = _normalize(code)
for expected in remaining:
if (
_normalize(expected["code"]) == norm_code
and expected["caption"] == caption
):
remaining.remove(expected)
break
if remaining:
missing = ", ".join(exp["caption"] for exp in remaining)
print(
f"Failure: Missing or incorrect Go code blocks: {missing}", file=sys.stderr
)
return False
# Verify column order Python → Go → JavaScript
if not _go_column_order_correct(notion, page_id):
print(
"Failure: Go column is not positioned between Python and JavaScript.",
file=sys.stderr,
)
return False
print(
"Success: Verified Go column with required code blocks and correct positioning."
)
return True
def main():
notion = notion_utils.get_notion_client()
main_id = sys.argv[1] if len(sys.argv) > 1 else None
sys.exit(0 if verify(notion, main_id) else 1)
if __name__ == "__main__":
main()
================================================
FILE: tasks/notion/standard/computer_science_student_dashboard/courses_internships_relation/description.md
================================================
Your goal is to connect the `Courses` and `Internship search` databases inside the **Computer Science Student Dashboard** page and populate them with sample data that can be verified automatically.
**Task Requirements:**
1. In the **Courses** database, add a new **relation** property named **Related Internships** that points to the **Internship search** database.
2. Ensure the relation is **bidirectional** by adding a relation property in the **Internship search** database named **Relevant Courses** that points back to the **Courses** database.
3. Create **exactly three** new pages in the **Courses** database with realistic computer-science course data. Each course page must include **all** of the following properties and values:
• **Code** (text) – unique codes `CS301`, `CS302`, and `CS303` respectively
• **Name** (text) – pick appropriate names (e.g., *Computer Networks*, *Operating Systems*, *Machine Learning*)
• **Credit** (number) – any positive integer
• **Status** (status) – choose from `Planned`, `In Progress`, or `Completed`
• **Related Internships** (relation) – link to at least one internship created in step 4.
4. Create **exactly two** new pages in the **Internship search** database with complete application information. Each internship page must include **all** of the following properties and values:
• **Company** (text) – `OpenAI` and `Google` respectively
• **Role** (text) – `Machine Learning Intern` and `Software Engineering Intern`
• **Status** (status) – set to `Interested`
• **Relevant Courses** (relation) – link to one or more of the courses created in step 3.
5. Every course created in step 3 must be linked to at least one internship from step 4 **and** every internship must be linked back to at least one course.
The task is considered complete when the relation properties exist, the specified course and internship pages are present with the exact values above, and the relations correctly connect the two databases in both directions.
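For reference, a minimal sketch of the API calls involved, using the `notion-client` Python SDK. The database and page IDs are hypothetical placeholders you would resolve first, and the exact `dual_property` payload for a two-way relation should be confirmed against the current Notion API reference:

```python
from notion_client import Client

notion = Client(auth="<NOTION_TOKEN>")                 # hypothetical token placeholder
courses_db_id = "<COURSES_DB_ID>"                      # hypothetical
internships_db_id = "<INTERNSHIP_SEARCH_DB_ID>"        # hypothetical
internship_page_id = "<OPENAI_INTERNSHIP_PAGE_ID>"     # hypothetical

# Steps 1-2: add a two-way (dual property) relation on Courses; Notion creates the
# synced property on the Internship search database, which can then be renamed to
# "Relevant Courses" if it is not created with that name.
notion.databases.update(
    database_id=courses_db_id,
    properties={
        "Related Internships": {
            "relation": {"database_id": internships_db_id, "dual_property": {}}
        }
    },
)

# Step 3: one of the three course pages, already linked to an internship page.
notion.pages.create(
    parent={"database_id": courses_db_id},
    properties={
        "Name": {"title": [{"text": {"content": "Computer Networks"}}]},
        "Code": {"rich_text": [{"text": {"content": "CS301"}}]},
        "Credit": {"number": 3},
        "Status": {"status": {"name": "Planned"}},
        "Related Internships": {"relation": [{"id": internship_page_id}]},
    },
)
```

Because the relation is dual, linking a course to an internship also populates the reverse relation on the internship page, so step 5 is satisfied once every page has at least one link.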
================================================
FILE: tasks/notion/standard/computer_science_student_dashboard/courses_internships_relation/meta.json
================================================
{
"task_id": "courses_internships_relation",
"task_name": "Courses Internships Relation",
"category_id": "computer_science_student_dashboard",
"category_name": "Computer Science Student Dashboard",
"description": "Connect the Courses and Internship search databases with bidirectional relations and populate with sample data.",
"author": "Zijian Wu",
"created_at": "2025-07-27",
"difficulty": "L3",
"tags": [
"database manipulation",
"cross-reference linking",
"template population"
],
"mcp": [
"notion"
],
"meta_data": {
"stateType": "url",
"stateContent": null,
"stateUrl": "https://painted-tennis-ebc.notion.site/Computer-Science-Student-Dashboard-23e81626b6d78083b787d3c832b02ef4",
"stateOriginalUrl": "https://www.notion.so/marketplace/templates/computer-science-student-dashboard"
}
}
================================================
FILE: tasks/notion/standard/computer_science_student_dashboard/courses_internships_relation/verify.py
================================================
import sys
from notion_client import Client
from tasks.utils import notion_utils
# ---------------------------------------------------------------------------
# Constants -----------------------------------------------------------------
# ---------------------------------------------------------------------------
MAIN_PAGE_TITLE = "Computer Science Student Dashboard"
COURSES_DB_TITLE = "Courses"
INTERNSHIP_DB_TITLE = "Internship search"
COURSE_CODES = {"CS301", "CS302", "CS303"}
COURSE_RELATION_NAME = "Related Internships"
INTERNSHIP_RELATION_NAME = "Relevant Courses"
INTERNSHIP_COMPANIES = {"OpenAI", "Google"}
# ---------------------------------------------------------------------------
# Helper functions -----------------------------------------------------------
# ---------------------------------------------------------------------------
def _locate_main_page(notion: Client, main_id: str | None) -> str | None:
"""Return the page_id of the dashboard page or None if not found."""
page_id = None
if main_id:
found_id, obj_type = notion_utils.find_page_or_database_by_id(notion, main_id)
if found_id and obj_type == "page":
page_id = found_id
if not page_id:
page_id = notion_utils.find_page(notion, MAIN_PAGE_TITLE)
return page_id
def _locate_database(notion: Client, parent_page_id: str, db_title: str) -> str | None:
"""Recursively search for a child database by title and return its id."""
return notion_utils.find_database_in_block(notion, parent_page_id, db_title)
# ---------------------------------------------------------------------------
# Verification logic ---------------------------------------------------------
# ---------------------------------------------------------------------------
def verify(notion: Client, main_id: str | None = None) -> bool:
"""Verify completion of the Courses ↔ Internship relation task."""
# ------------------------------------------------------------------
# Locate main page and databases -----------------------------------
# ------------------------------------------------------------------
page_id = _locate_main_page(notion, main_id)
if not page_id:
print(f"Error: Page '{MAIN_PAGE_TITLE}' not found.", file=sys.stderr)
return False
courses_db_id = _locate_database(notion, page_id, COURSES_DB_TITLE)
internships_db_id = _locate_database(notion, page_id, INTERNSHIP_DB_TITLE)
if not courses_db_id:
print(f"Error: Database '{COURSES_DB_TITLE}' not found.", file=sys.stderr)
return False
if not internships_db_id:
print(f"Error: Database '{INTERNSHIP_DB_TITLE}' not found.", file=sys.stderr)
return False
# ------------------------------------------------------------------
# Validate relation properties -------------------------------------
# ------------------------------------------------------------------
courses_db_obj = notion.databases.retrieve(database_id=courses_db_id)
internships_db_obj = notion.databases.retrieve(database_id=internships_db_id)
courses_props = courses_db_obj.get("properties", {})
internships_props = internships_db_obj.get("properties", {})
# Courses → Internships relation
if COURSE_RELATION_NAME not in courses_props:
print(
f"Error: Property '{COURSE_RELATION_NAME}' missing in Courses database.",
file=sys.stderr,
)
return False
course_rel_prop = courses_props[COURSE_RELATION_NAME]
if (
course_rel_prop.get("type") != "relation"
or course_rel_prop["relation"].get("database_id") != internships_db_id
):
print(
"Error: Courses relation property is not configured correctly.",
file=sys.stderr,
)
return False
# Internships → Courses relation
if INTERNSHIP_RELATION_NAME not in internships_props:
print(
f"Error: Property '{INTERNSHIP_RELATION_NAME}' missing in Internship search database.",
file=sys.stderr,
)
return False
intern_rel_prop = internships_props[INTERNSHIP_RELATION_NAME]
if (
intern_rel_prop.get("type") != "relation"
or intern_rel_prop["relation"].get("database_id") != courses_db_id
):
print(
"Error: Internship relation property is not configured correctly.",
file=sys.stderr,
)
return False
# ------------------------------------------------------------------
# Validate course pages --------------------------------------------
# ------------------------------------------------------------------
course_pages = notion.databases.query(database_id=courses_db_id).get("results", [])
valid_course_count = 0
course_page_id_set = set()
internship_ids_seen: set[str] = set()
for page in course_pages:
props = page.get("properties", {})
code_rts = props.get("Code", {}).get("rich_text", [])
code_val = "".join(rt.get("plain_text", "") for rt in code_rts).strip()
if code_val not in COURSE_CODES:
continue # not one of the new course entries we care about
# Check required scalar props
title_rts = props.get("Name", {}).get("title", [])
name_ok = bool("".join(rt.get("plain_text", "") for rt in title_rts).strip())
credits_ok = props.get("Credit", {}).get("number") is not None
status_name = props.get("Status", {}).get("status", {}).get("name", "")
status_allowed = {"planned", "in progress", "completed"}
status_ok = status_name.lower() in status_allowed
# Relation must point to at least one internship
relations = props.get(COURSE_RELATION_NAME, {}).get("relation", [])
if not (name_ok and credits_ok and status_ok and relations):
print(
f"Error: Course '{code_val}' is missing required property values or relations, or wrong values.",
file=sys.stderr,
)
return False
# Collect IDs for further mutual check
course_page_id_set.add(page["id"])
internship_ids_seen.update(rel["id"] for rel in relations)
valid_course_count += 1
if valid_course_count != 3:
print(
f"Error: Expected exactly 3 new course pages with codes {COURSE_CODES}, found {valid_course_count}.",
file=sys.stderr,
)
return False
# ------------------------------------------------------------------
# Validate internship pages ----------------------------------------
# ------------------------------------------------------------------
internship_pages = notion.databases.query(database_id=internships_db_id).get(
"results", []
)
valid_intern_count = 0
internship_page_ids = set()
course_ids_seen_from_intern: set[str] = set()
for page in internship_pages:
props = page.get("properties", {})
company_rts = props.get("Company", {}).get("rich_text", [])
company = "".join(rt.get("plain_text", "") for rt in company_rts).strip()
if company not in INTERNSHIP_COMPANIES:
continue # not one of the two new internships
role_rts = props.get("Role", {}).get("title", [])
role_ok = bool("".join(rt.get("plain_text", "") for rt in role_rts).strip())
status_name = props.get("Status", {}).get("status", {}).get("name", "")
status_ok = status_name.lower() == "interested"
relations = props.get(INTERNSHIP_RELATION_NAME, {}).get("relation", [])
if not (role_ok and status_ok and relations):
print(
f"Error: Internship at '{company}' is missing required property values or relations, or wrong values.",
file=sys.stderr,
)
return False
internship_page_ids.add(page["id"])
course_ids_seen_from_intern.update(rel["id"] for rel in relations)
valid_intern_count += 1
if valid_intern_count != 2:
print(
f"Error: Expected exactly 2 new internship pages for companies {INTERNSHIP_COMPANIES}, found {valid_intern_count}.",
file=sys.stderr,
)
return False
# ------------------------------------------------------------------
# Mutual relation consistency --------------------------------------
# ------------------------------------------------------------------
# Each relation from courses should point to one of the two internships identified
if not internship_ids_seen.issubset(internship_page_ids):
print(
"Error: Some course relations point to pages outside the expected internships.",
file=sys.stderr,
)
return False
# Each relation from internships should point back to the three course pages identified
if not course_ids_seen_from_intern.issubset(course_page_id_set):
print(
"Error: Some internship relations point to pages outside the expected courses.",
file=sys.stderr,
)
return False
print(
"Success: Verified bidirectional relations, course and internship entries as required."
)
return True
# ---------------------------------------------------------------------------
# CLI entry-point -----------------------------------------------------------
# ---------------------------------------------------------------------------
def main() -> None:
notion = notion_utils.get_notion_client()
main_id = sys.argv[1] if len(sys.argv) > 1 else None
sys.exit(0 if verify(notion, main_id) else 1)
if __name__ == "__main__":
main()
================================================
FILE: tasks/notion/standard/computer_science_student_dashboard/study_session_tracker/description.md
================================================
Your goal is to create a new study-session entry in the **Computer Science Student Dashboard** page.
1. Locate the ☑️ Habit tracker section of the page.
2. **Insert a new date section for `2025-01-29`** immediately **after the existing `2022-09-02` to-do items but *before* the divider block** that follows them. Format the new date like the existing ones, with a bold date mention, and leave all of its to-do items unchecked initially.
3. Directly **beneath** this new date mention, add **exactly four unchecked to-do blocks** with the following plain text (including the leading emoji on each line):
• 🧠 Review algorithms for technical interview
• 📚 Study database systems chapter 7
• ⚡ Practice system design problems
• 🎯 Complete data structures assignment
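For reference, a minimal sketch of the block payloads with the `notion-client` SDK. The block IDs are hypothetical placeholders, and the `after` positioning argument of the children-append endpoint should be confirmed against the API/SDK version in use:

```python
from notion_client import Client

notion = Client(auth="<NOTION_TOKEN>")                         # hypothetical token placeholder
parent_block_id = "<HABIT_TRACKER_PARENT_BLOCK_ID>"            # hypothetical
last_2022_09_02_todo_id = "<LAST_2022_09_02_TODO_BLOCK_ID>"    # hypothetical anchor block

# Bold date mention followed by the four unchecked to-dos.
new_blocks = [
    {
        "type": "paragraph",
        "paragraph": {
            "rich_text": [{
                "type": "mention",
                "mention": {"date": {"start": "2025-01-29"}},
                "annotations": {"bold": True},
            }]
        },
    }
] + [
    {
        "type": "to_do",
        "to_do": {
            "rich_text": [{"type": "text", "text": {"content": text}}],
            "checked": False,
        },
    }
    for text in [
        "🧠 Review algorithms for technical interview",
        "📚 Study database systems chapter 7",
        "⚡ Practice system design problems",
        "🎯 Complete data structures assignment",
    ]
]

# Appending with `after` keeps the new section before the divider that follows
# the 2022-09-02 to-dos.
notion.blocks.children.append(
    block_id=parent_block_id,
    children=new_blocks,
    after=last_2022_09_02_todo_id,
)
```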
================================================
FILE: tasks/notion/standard/computer_science_student_dashboard/study_session_tracker/meta.json
================================================
{
"task_id": "study_session_tracker",
"task_name": "Study Session Tracker",
"category_id": "computer_science_student_dashboard",
"category_name": "Computer Science Student Dashboard",
"description": "Create a new study-session entry in the Habit tracker section with four unchecked to-do items.",
"author": "Zijian Wu",
"created_at": "2025-07-27",
"difficulty": "L3",
"tags": [
"content organization",
"visual formatting",
"status tracking"
],
"mcp": [
"notion"
],
"meta_data": {
"stateType": "url",
"stateContent": null,
"stateUrl": "https://painted-tennis-ebc.notion.site/Computer-Science-Student-Dashboard-23e81626b6d78083b787d3c832b02ef4",
"stateOriginalUrl": "https://www.notion.so/marketplace/templates/computer-science-student-dashboard"
}
}
================================================
FILE: tasks/notion/standard/computer_science_student_dashboard/study_session_tracker/verify.py
================================================
import sys
from notion_client import Client
from tasks.utils import notion_utils
from typing import Dict
def _normalize_string(s: str) -> str:
"""Replace non-breaking space with regular space for safe comparison."""
return s.replace("\xa0", " ")
def verify(notion: Client, main_id: str | None = None) -> bool:
"""Verify that the new study-session entry for 2025-01-29 was added correctly.
The script checks that:
1. A bold date-mention with start=2025-01-29 exists.
2. The mention sits after the 2022-09-02 section but before the divider that originally
followed that section.
3. Exactly four specified to-do items follow the new date mention and they are all unchecked.
"""
# ---------------------------------------------------------------------
# Locate the main page -------------------------------------------------
# ---------------------------------------------------------------------
page_id: str | None = None
if main_id:
found_id, object_type = notion_utils.find_page_or_database_by_id(
notion, main_id
)
if found_id and object_type == "page":
page_id = found_id
if not page_id:
page_id = notion_utils.find_page(notion, "Computer Science Student Dashboard")
if not page_id:
print(
"Error: Page 'Computer Science Student Dashboard' not found.",
file=sys.stderr,
)
return False
# ---------------------------------------------------------------------
# Fetch all blocks under the page (flattened order) --------------------
# ---------------------------------------------------------------------
all_blocks = notion_utils.get_all_blocks_recursively(notion, page_id)
# ---------------------------------------------------------------------
# Locate reference blocks ---------------------------------------------
# ---------------------------------------------------------------------
TARGET_DATE = "2025-01-29"
PREVIOUS_DATE = "2022-09-02"
index_previous_date: int | None = None
index_new_date: int | None = None
index_divider_after_previous: int | None = None
for idx, block in enumerate(all_blocks):
# Divider detection (we care only about the first divider that appears after
# the 2022-09-02 block)
if block.get("type") == "divider":
if index_previous_date is not None and index_divider_after_previous is None:
index_divider_after_previous = idx
# We only need to inspect paragraph blocks that contain a date mention
if block.get("type") != "paragraph":
continue
rich_text_list = block["paragraph"].get("rich_text", [])
for rt in rich_text_list:
if (
rt.get("type") != "mention"
or rt.get("mention", {}).get("type") != "date"
):
continue
date_start = rt["mention"]["date"].get("start")
if date_start == PREVIOUS_DATE and index_previous_date is None:
index_previous_date = idx
if date_start == TARGET_DATE and index_new_date is None:
index_new_date = idx
# (1) Verify bold annotation
if not rt.get("annotations", {}).get("bold", False):
print(
"Error: The 2025-01-29 date mention is not bold.",
file=sys.stderr,
)
return False
# Ensure all reference indices were found
if index_previous_date is None:
print("Error: Could not locate the 2022-09-02 date section.", file=sys.stderr)
return False
if index_divider_after_previous is None:
print(
"Error: Could not locate the divider that follows the 2022-09-02 section.",
file=sys.stderr,
)
return False
if index_new_date is None:
print(
"Error: Could not locate the new 2025-01-29 date mention.", file=sys.stderr
)
return False
# (2) Verify ordering
if not (index_previous_date < index_new_date < index_divider_after_previous):
print(
"Error: The 2025-01-29 section is positioned incorrectly.", file=sys.stderr
)
return False
# ---------------------------------------------------------------------
# Verify to-do items under the new date section ------------------------
# ---------------------------------------------------------------------
expected_texts = [
"🧠 Review algorithms for technical interview",
"📚 Study database systems chapter 7",
"⚡ Practice system design problems",
"🎯 Complete data structures assignment",
]
expected_todos: Dict[str, bool] = {
_normalize_string(t): False for t in expected_texts
}
# Look through the blocks that lie between the new date mention and the divider
for block in all_blocks[index_new_date + 1 : index_divider_after_previous]:
if block.get("type") != "to_do":
# Any non to-do block inside this range indicates mis-placement.
# We simply ignore it – correctness is determined by presence of required to-dos.
continue
plain_text = notion_utils.get_block_plain_text(block).strip()
plain_text_norm = _normalize_string(plain_text)
if plain_text_norm in expected_todos:
# (3a) Verify the to-do is unchecked
if block["to_do"].get("checked", False):
print(f"Error: To-do '{plain_text}' is checked.", file=sys.stderr)
return False
expected_todos[plain_text_norm] = True
missing_items = [text for text, found in expected_todos.items() if not found]
if missing_items:
print(f"Error: Missing to-do items: {missing_items}", file=sys.stderr)
return False
# ---------------------------------------------------------------------
# Success --------------------------------------------------------------
# ---------------------------------------------------------------------
print("Success: Study session for 2025-01-29 added correctly.")
return True
# -------------------------------------------------------------------------
# Command-line entry-point -------------------------------------------------
# -------------------------------------------------------------------------
def main() -> None:
notion = notion_utils.get_notion_client()
main_id = sys.argv[1] if len(sys.argv) > 1 else None
if verify(notion, main_id):
sys.exit(0)
else:
sys.exit(1)
if __name__ == "__main__":
main()
================================================
FILE: tasks/notion/standard/it_trouble_shooting_hub/asset_retirement_migration/description.md
================================================
Please restructure the **IT Inventory** database as described below. Your automation will be checked by an automated script, so follow every detail exactly.
---
Task Steps
1. Inside the **IT Trouble Shooting Hub** page, locate the database named **IT Inventory**.
2. Query this database and collect every page whose **Status** property is **Expired** or **To be returned**.
3. Create a **new full-page database** directly under the same IT Trouble Shooting Hub page called **IT Asset Retirement Queue**.
4. Configure this new database so that it contains **exactly** the following properties (spellings and types must match):
• Serial – title
• Tags – multi_select
• Status – select
• Vendor – select
• Expiration date – date
• Retirement Reason – select with option set { **Expired License**, **Hardware Obsolete**, **Security Risk**, **User Offboarding** }
5. For every inventory item gathered in step 2:
a. Create a corresponding page in **IT Asset Retirement Queue** and copy over the values of the Serial, Tags, Status, Vendor and Expiration date properties.
b. Set **Retirement Reason** to one of the four options above (choose the most appropriate).
c. Archive the original inventory page **after** the new page has been created.
6. After all items are migrated:
a. Update the **description** of the **IT Asset Retirement Queue** database so it is **exactly** `AUTO-GENERATED MIGRATION COMPLETED` (no additional text).
b. Create a new page under **IT Trouble Shooting Hub** titled **Retirement Migration Log**. Inside this page, add a **callout block** whose text follows the exact pattern:
`Successfully migrated [NUMBER] assets to the retirement queue on 2025-03-24.`
• `[NUMBER]` is the total number of items moved.
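The migration reduces to a filtered query, page creation in the new database, archiving the originals, and a final description update. A minimal sketch with the `notion-client` SDK, assuming `Serial` is the title property in both databases and using hypothetical IDs:

```python
from notion_client import Client

notion = Client(auth="<NOTION_TOKEN>")            # hypothetical token placeholder
inventory_db_id = "<IT_INVENTORY_DB_ID>"          # hypothetical
retirement_db_id = "<RETIREMENT_QUEUE_DB_ID>"     # hypothetical

# Step 2: every item whose Status is Expired or To be returned.
items = notion.databases.query(
    database_id=inventory_db_id,
    filter={"or": [
        {"property": "Status", "select": {"equals": "Expired"}},
        {"property": "Status", "select": {"equals": "To be returned"}},
    ]},
)["results"]

# Step 5: copy each item into the retirement queue, then archive the original.
for item in items:
    props = item["properties"]
    notion.pages.create(
        parent={"database_id": retirement_db_id},
        properties={
            # Writing the retrieved value arrays back is usually accepted;
            # rebuild them from plain_text / name if the API rejects extra fields.
            "Serial": {"title": props["Serial"]["title"]},
            "Tags": {"multi_select": props["Tags"]["multi_select"]},
            "Status": {"select": {"name": props["Status"]["select"]["name"]}},
            "Vendor": {"select": {"name": props["Vendor"]["select"]["name"]}},
            "Expiration date": {"date": props["Expiration date"]["date"]},
            "Retirement Reason": {"select": {"name": "Expired License"}},  # pick per item in practice
        },
    )
    notion.pages.update(page_id=item["id"], archived=True)

# Step 6a: the description must match the expected text exactly.
notion.databases.update(
    database_id=retirement_db_id,
    description=[{"type": "text", "text": {"content": "AUTO-GENERATED MIGRATION COMPLETED"}}],
)
```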
================================================
FILE: tasks/notion/standard/it_trouble_shooting_hub/asset_retirement_migration/meta.json
================================================
{
"task_id": "asset_retirement_migration",
"task_name": "Asset Retirement Migration",
"category_id": "it_trouble_shooting_hub",
"category_name": "IT Trouble Shooting Hub",
"description": "Restructure the IT Inventory database by migrating expired assets to a new IT Asset Retirement Queue database.",
"author": "Zijian Wu",
"created_at": "2025-07-27",
"difficulty": "L3",
"tags": [
"database manipulation",
"automated migration",
"conditional filtering",
"data aggregation",
"report generation"
],
"mcp": [
"notion"
],
"meta_data": {
"stateType": "url",
"stateContent": null,
"stateUrl": "https://painted-tennis-ebc.notion.site/It-Trouble-Shooting-Hub-23e81626b6d78020aba7eb65ae1cc2d5",
"stateOriginalUrl": "https://www.notion.so/marketplace/templates/it-trouble-shooting-hub"
}
}
================================================
FILE: tasks/notion/standard/it_trouble_shooting_hub/asset_retirement_migration/verify.py
================================================
import sys
from typing import Dict, Set
from notion_client import Client
from tasks.utils import notion_utils
def _get_database(root_page_id: str, notion: Client, name: str) -> str | None:
"""Helper that finds a child database by title inside a page."""
return notion_utils.find_database_in_block(notion, root_page_id, name)
def _check_property(props: Dict, name: str, expected_type: str) -> bool:
if name not in props:
print(f"Error: Property '{name}' missing in database.", file=sys.stderr)
return False
if props[name]["type"] != expected_type:
print(
f"Error: Property '{name}' expected type '{expected_type}', found '{props[name]['type']}'.",
file=sys.stderr,
)
return False
return True
def verify(notion: Client, main_id: str | None = None) -> bool:
"""Verifies that the IT Asset Retirement Queue was created and populated correctly."""
# -------------------------------------------------------------------------
# Resolve the root IT Trouble Shooting Hub page
# -------------------------------------------------------------------------
root_page_id = None
if main_id:
found_id, obj_type = notion_utils.find_page_or_database_by_id(notion, main_id)
if found_id and obj_type == "page":
root_page_id = found_id
if not root_page_id:
root_page_id = notion_utils.find_page(notion, "IT Trouble Shooting Hub")
if not root_page_id:
print(
"Error: Could not locate the 'IT Trouble Shooting Hub' page.",
file=sys.stderr,
)
return False
# -------------------------------------------------------------------------
# Locate the original and new databases
# -------------------------------------------------------------------------
inventory_db_id = _get_database(root_page_id, notion, "IT Inventory")
if not inventory_db_id:
print("Error: 'IT Inventory' database not found.", file=sys.stderr)
return False
retirement_db_id = _get_database(root_page_id, notion, "IT Asset Retirement Queue")
if not retirement_db_id:
print("Error: 'IT Asset Retirement Queue' database not found.", file=sys.stderr)
return False
# -------------------------------------------------------------------------
# Validate schema of the retirement queue database
# -------------------------------------------------------------------------
retirement_db = notion.databases.retrieve(database_id=retirement_db_id)
r_props = retirement_db["properties"]
required_schema = {
"Serial": "title",
"Tags": "multi_select",
"Status": "select",
"Vendor": "select",
"Expiration date": "date",
"Retirement Reason": "select",
}
for pname, ptype in required_schema.items():
if not _check_property(r_props, pname, ptype):
return False
# Check Retirement Reason options
expected_reason_options: Set[str] = {
"Expired License",
"Hardware Obsolete",
"Security Risk",
"User Offboarding",
}
actual_options = {
opt["name"] for opt in r_props["Retirement Reason"]["select"]["options"]
}
if actual_options != expected_reason_options:
print(
"Error: 'Retirement Reason' select options mismatch.\n"
f"Expected: {sorted(expected_reason_options)}\n"
f"Found: {sorted(actual_options)}",
file=sys.stderr,
)
return False
# ---------------------------------------------------------------
    # Validate database description matches the required phrase exactly
# ---------------------------------------------------------------
desc_rich = retirement_db.get("description", [])
desc_text = "".join([t.get("plain_text", "") for t in desc_rich])
required_desc = "AUTO-GENERATED MIGRATION COMPLETED"
if desc_text.strip() != required_desc:
print(
f"Error: Retirement database description must be exactly '{required_desc}'.",
file=sys.stderr,
)
return False
# -------------------------------------------------------------------------
# Validate that inventory items are moved & archived
# -------------------------------------------------------------------------
expired_filter = {
"property": "Status",
"select": {"equals": "Expired"},
}
to_return_filter = {
"property": "Status",
"select": {"equals": "To be returned"},
}
compound_filter = {"or": [expired_filter, to_return_filter]}
# Query for any *active* items that still match these statuses
remaining_items = notion.databases.query(
database_id=inventory_db_id,
filter=compound_filter,
archived=False,
).get("results", [])
if remaining_items:
print(
f"Error: {len(remaining_items)} 'Expired' / 'To be returned' items still present in IT Inventory.",
file=sys.stderr,
)
return False
    # The retirement queue should contain exactly the expected migrated entries
retirement_pages = notion.databases.query(database_id=retirement_db_id).get(
"results", []
)
expected_serials = {"65XYQ/GB", "36x10PIQ"}
if len(retirement_pages) != len(expected_serials):
print(
f"Error: Expected {len(expected_serials)} retirement pages, found {len(retirement_pages)}.",
file=sys.stderr,
)
return False
# Each retirement page must have a Retirement Reason
serials_seen = set()
for page in retirement_pages:
props = page["properties"]
reason = props.get("Retirement Reason", {}).get("select", {})
if not reason or reason.get("name") not in expected_reason_options:
print(
f"Error: Page {page['id']} missing valid 'Retirement Reason'.",
file=sys.stderr,
)
return False
# Collect Serial title
title_rich = props.get("Serial", {}).get("title", [])
serial_val = "".join([t.get("plain_text", "") for t in title_rich]).strip()
serials_seen.add(serial_val)
if serials_seen != expected_serials:
print(
f"Error: Serial values mismatch. Expected {sorted(expected_serials)}, found {sorted(serials_seen)}.",
file=sys.stderr,
)
return False
# -----------------------------------------------------------------
# Verify the migration log page and callout block contents
# -----------------------------------------------------------------
log_page_title = "Retirement Migration Log"
log_page_id = notion_utils.find_page(notion, log_page_title)
if not log_page_id:
print(f"Error: Page '{log_page_title}' not found.", file=sys.stderr)
return False
# Search for a callout block with required pattern
import re
callout_pattern = re.compile(
r"Successfully migrated (\d+) assets to the retirement queue on 2025-03-24\."
)
blocks = notion_utils.get_all_blocks_recursively(notion, log_page_id)
match_found = False
for blk in blocks:
if blk.get("type") == "callout":
text = notion_utils.get_block_plain_text(blk)
m = callout_pattern.search(text)
if m:
migrated_num = int(m.group(1))
if migrated_num == len(expected_serials):
match_found = True
else:
print(
f"Error: Callout reports {migrated_num} assets, but {len(retirement_pages)} retirement pages found.",
file=sys.stderr,
)
return False
break
if not match_found:
print(
"Error: Required callout block not found in migration log page.",
file=sys.stderr,
)
return False
print("Success: All verification criteria satisfied.")
return True
def main():
notion = notion_utils.get_notion_client()
main_id = sys.argv[1] if len(sys.argv) > 1 else None
if verify(notion, main_id):
sys.exit(0)
else:
sys.exit(1)
if __name__ == "__main__":
main()
================================================
FILE: tasks/notion/standard/it_trouble_shooting_hub/security_audit_ticket/description.md
================================================
Please help me create a comprehensive security audit ticket based on the data already stored in the **IT Trouble Shooting Hub** page.
Your automation should:
1. In the **IT Inventory** database, find every item whose **Expiration date** is **before 2023-07-15**.
2. In the **IT FAQs** database, look up any FAQ entries that have the **"Security"** tag.
3. **Create a new page** inside the **IT Requests** database with **exact title**:
`Quarterly Security Audit - Expired Assets Review`
4. Set its **Priority** property to **High**.
5. Set its **Due** property to **2023-06-22**.
6. In the page body, add a bullet-list block that enumerates **each expired inventory item**. **Each bullet item must follow this exact text format (including the dashes):**
`[SERIAL] - [TAG] - [RECOMMENDATION]`
• `[SERIAL]` is the item’s Serial value.
• `[TAG]` is the first tag assigned to the inventory item (e.g., "Laptop").
• `[RECOMMENDATION]` is a brief action you suggest based on the security FAQ entry (any text is acceptable).
Example (do **not** copy):
`ABC123 - Laptop - Renew warranty and enable disk encryption`
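A minimal sketch of the query-and-create flow with the `notion-client` SDK. Database IDs are hypothetical placeholders, the FAQ lookup from step 2 is omitted, and the recommendation text is illustrative only:

```python
from notion_client import Client

notion = Client(auth="<NOTION_TOKEN>")       # hypothetical token placeholder
inventory_db_id = "<IT_INVENTORY_DB_ID>"     # hypothetical
requests_db_id = "<IT_REQUESTS_DB_ID>"       # hypothetical

# Step 1: items whose Expiration date is before 2023-07-15.
expired = notion.databases.query(
    database_id=inventory_db_id,
    filter={"property": "Expiration date", "date": {"before": "2023-07-15"}},
)["results"]

# Steps 3-6: create the ticket with one bullet per expired item.
bullets = []
for item in expired:
    props = item["properties"]
    serial = "".join(t["plain_text"] for t in props["Serial"]["title"])  # assumes Serial is the title
    tags = props["Tags"]["multi_select"]
    first_tag = tags[0]["name"] if tags else ""
    advice = "Review against the security FAQ and renew or retire"       # any text is acceptable
    bullets.append({
        "type": "bulleted_list_item",
        "bulleted_list_item": {
            "rich_text": [{"type": "text", "text": {"content": f"{serial} - {first_tag} - {advice}"}}]
        },
    })

notion.pages.create(
    parent={"database_id": requests_db_id},
    properties={
        "Task name": {"title": [{"text": {"content": "Quarterly Security Audit - Expired Assets Review"}}]},
        "Priority": {"select": {"name": "High"}},
        "Due": {"date": {"start": "2023-06-22"}},
    },
    children=bullets,
)
```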
================================================
FILE: tasks/notion/standard/it_trouble_shooting_hub/security_audit_ticket/meta.json
================================================
{
"task_id": "security_audit_ticket",
"task_name": "Security Audit Ticket",
"category_id": "it_trouble_shooting_hub",
"category_name": "IT Trouble Shooting Hub",
"description": "Create a comprehensive security audit ticket based on expired inventory items and security FAQ entries.",
"author": "Zijian Wu",
"created_at": "2025-07-27",
"difficulty": "L3",
"tags": [
"conditional filtering",
"database manipulation",
"data aggregation",
"report generation"
],
"mcp": [
"notion"
],
"meta_data": {
"stateType": "url",
"stateContent": null,
"stateUrl": "https://painted-tennis-ebc.notion.site/It-Trouble-Shooting-Hub-23e81626b6d78020aba7eb65ae1cc2d5",
"stateOriginalUrl": "https://www.notion.so/marketplace/templates/it-trouble-shooting-hub"
}
}
================================================
FILE: tasks/notion/standard/it_trouble_shooting_hub/security_audit_ticket/verify.py
================================================
import sys
from notion_client import Client
from tasks.utils import notion_utils
import re
def _get_title_text(page_properties: dict) -> str:
"""Extract the plain text of the first title property from a page."""
for prop in page_properties.values():
if prop.get("type") == "title":
title_rich = prop.get("title", [])
if title_rich:
return title_rich[0].get("plain_text")
return ""
def verify(notion: Client, main_id: str | None = None) -> bool:
"""Verify that the automation created the expected security audit ticket."""
# ----------------------------------------------------------------------------------
# Locate the root page (IT Trouble Shooting Hub) either via main_id or by title.
# ----------------------------------------------------------------------------------
root_page_id = None
if main_id:
found_id, object_type = notion_utils.find_page_or_database_by_id(
notion, main_id
)
if found_id and object_type == "page":
root_page_id = found_id
if not root_page_id:
root_page_id = notion_utils.find_page(notion, "IT Trouble Shooting Hub")
if not root_page_id:
print(
"Error: Could not locate the 'IT Trouble Shooting Hub' page.",
file=sys.stderr,
)
return False
# ----------------------------------------------------------------------------------
# Find the IT Requests database under the root page.
# ----------------------------------------------------------------------------------
requests_db_id = notion_utils.find_database_in_block(
notion, root_page_id, "IT Requests"
)
if not requests_db_id:
print(
"Error: 'IT Requests' database not found in the workspace.", file=sys.stderr
)
return False
# ----------------------------------------------------------------------------------
# Search for the expected ticket inside the IT Requests database.
# ----------------------------------------------------------------------------------
expected_title = "Quarterly Security Audit - Expired Assets Review"
results = notion.databases.query(database_id=requests_db_id).get("results", [])
target_page = None
for page in results:
title_text = _get_title_text(page.get("properties", {}))
if title_text == expected_title:
target_page = page
break
if not target_page:
print(
f"Failure: Ticket with title '{expected_title}' was not found in 'IT Requests' database.",
file=sys.stderr,
)
return False
props = target_page.get("properties", {})
# ----------------------------------------------------------------------------------
# Validate Priority property.
# ----------------------------------------------------------------------------------
priority_value = props.get("Priority", {}).get("select", {}).get("name")
if priority_value != "High":
print(
f"Failure: Expected Priority 'High', found '{priority_value}'.",
file=sys.stderr,
)
return False
# ----------------------------------------------------------------------------------
# Validate Due date property.
# ----------------------------------------------------------------------------------
due_date_start = props.get("Due", {}).get("date", {}).get("start")
expected_due_iso = "2023-06-22"
if not due_date_start or not due_date_start.startswith(expected_due_iso):
print(
f"Failure: Expected Due date '{expected_due_iso}', found '{due_date_start}'.",
file=sys.stderr,
)
return False
# ----------------------------------------------------------------------------------
# Validate the bulleted list contains the correct expired items in required format.
# ----------------------------------------------------------------------------------
page_id = target_page["id"]
blocks = notion.blocks.children.list(block_id=page_id).get("results", [])
bullet_texts = [
notion_utils.get_block_plain_text(b)
for b in blocks
if b.get("type") == "bulleted_list_item"
]
expected_items = {
"192371-8910/54": "Computer Accessory",
"32x11PIP": "Computer Accessory",
"76x87PCY": "Laptop",
"36x10PIQ": "Computer Accessory",
"65XYQ/GB": "License",
}
if len(bullet_texts) != len(expected_items):
print(
f"Failure: Expected {len(expected_items)} bullet items, found {len(bullet_texts)}.",
file=sys.stderr,
)
return False
bullet_pattern = re.compile(r"^\s*(.*?)\s+-\s+(.*?)\s+-\s+(.+?)\s*$")
matched = set()
for text in bullet_texts:
m = bullet_pattern.match(text)
if not m:
print(
f"Failure: Bullet item '{text}' does not follow ' - - ' format.",
file=sys.stderr,
)
return False
serial, tag, advice = m.group(1).strip(), m.group(2).strip(), m.group(3).strip()
if serial not in expected_items:
print(
f"Failure: Unexpected Serial '{serial}' found in bullet list.",
file=sys.stderr,
)
return False
if expected_items[serial] != tag:
print(
f"Failure: Serial '{serial}' expected tag '{expected_items[serial]}', found '{tag}'.",
file=sys.stderr,
)
return False
if not advice:
print(
f"Failure: Bullet item for Serial '{serial}' is missing a recommendation/advice.",
file=sys.stderr,
)
return False
matched.add(serial)
if len(matched) != len(expected_items):
missing = set(expected_items.keys()) - matched
print(
f"Failure: Missing bullet items for serials: {', '.join(missing)}.",
file=sys.stderr,
)
return False
print("Success: All verification criteria satisfied.")
return True
def main():
notion = notion_utils.get_notion_client()
main_id = sys.argv[1] if len(sys.argv) > 1 else None
if verify(notion, main_id):
sys.exit(0)
else:
sys.exit(1)
if __name__ == "__main__":
main()
================================================
FILE: tasks/notion/standard/it_trouble_shooting_hub/verification_expired_update/description.md
================================================
**Task Overview**
My IT knowledge base contains pages whose verification status has expired:
**Task Requirements**
1. Locate the database named **"IT Homepage"** inside the main page **"It Trouble Shooting Hub"**.
2. Within that database, find every page (except for **"It Inventory"**) where the **Verification** property state contains `expired`.
3. For **each** expired page:
• Insert a **callout block** at the very top (as the first child block) whose rich-text content is:
`VERIFICATION EXPIRED - This page needs review and re-verification`
• Set the callout’s icon to ⚠️.
• Set the callout’s colour to `red_background`.
4. Create a new entry in the **"IT Requests"** database with:
• Title (property **Task name**) **exactly** `Batch Verification Update Required`.
• **Priority** set to `High`.
• **Status** set to `In progress`.
• In the page body add a **bulleted list** where each bullet is a **mention** of the page processed in step 3 (i.e., use the Notion mention object linking to that page).
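For reference, a minimal sketch of the callout payload and the batch request page with the `notion-client` SDK. The IDs are hypothetical placeholders, Status is assumed to be a status-type property, and note that a plain children-append adds blocks at the end, so making the callout the *first* child requires extra positioning handling:

```python
from notion_client import Client

notion = Client(auth="<NOTION_TOKEN>")               # hypothetical token placeholder
requests_db_id = "<IT_REQUESTS_DB_ID>"               # hypothetical
expired_page_ids = ["<PAGE_ID_1>", "<PAGE_ID_2>"]    # hypothetical, from step 2

# Step 3: callout payload expected on each expired page (must end up as the first child).
callout = {
    "type": "callout",
    "callout": {
        "rich_text": [{
            "type": "text",
            "text": {"content": "VERIFICATION EXPIRED - This page needs review and re-verification"},
        }],
        "icon": {"type": "emoji", "emoji": "⚠️"},
        "color": "red_background",
    },
}
for pid in expired_page_ids:
    # Plain append places the block at the end; reposition it to the top as required.
    notion.blocks.children.append(block_id=pid, children=[callout])

# Step 4: the batch-update request with one page mention per bullet.
notion.pages.create(
    parent={"database_id": requests_db_id},
    properties={
        "Task name": {"title": [{"text": {"content": "Batch Verification Update Required"}}]},
        "Priority": {"select": {"name": "High"}},
        "Status": {"status": {"name": "In progress"}},
    },
    children=[
        {
            "type": "bulleted_list_item",
            "bulleted_list_item": {
                "rich_text": [{"type": "mention", "mention": {"page": {"id": pid}}}]
            },
        }
        for pid in expired_page_ids
    ],
)
```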
================================================
FILE: tasks/notion/standard/it_trouble_shooting_hub/verification_expired_update/meta.json
================================================
{
"task_id": "verification_expired_update",
"task_name": "Verification Expired Update",
"category_id": "it_trouble_shooting_hub",
"category_name": "IT Trouble Shooting Hub",
"description": "Update pages with expired verification status by adding warning callouts and creating a batch update request.",
"author": "Zijian Wu",
"created_at": "2025-07-27",
"difficulty": "L3",
"tags": [
"conditional filtering",
"visual formatting",
"database manipulation",
"cross-reference linking",
"status tracking"
],
"mcp": [
"notion"
],
"meta_data": {
"stateType": "url",
"stateContent": null,
"stateUrl": "https://painted-tennis-ebc.notion.site/It-Trouble-Shooting-Hub-23e81626b6d78020aba7eb65ae1cc2d5",
"stateOriginalUrl": "https://www.notion.so/marketplace/templates/it-trouble-shooting-hub"
}
}
================================================
FILE: tasks/notion/standard/it_trouble_shooting_hub/verification_expired_update/verify.py
================================================
import sys
from notion_client import Client
from tasks.utils import notion_utils
CALL_OUT_TEXT = "VERIFICATION EXPIRED - This page needs review and re-verification"
CALL_OUT_ICON = "⚠️"
CALL_OUT_COLOR = "red_background"
IT_HOMEPAGE_DB_TITLE = "IT Homepage"
IT_REQUESTS_DB_TITLE = "IT Requests"
REQUEST_TITLE = "Batch Verification Update Required"
PRIORITY_HIGH = "High"
STATUS_IN_PROGRESS = "In progress"
def _get_main_page_id(notion: Client, main_id: str | None) -> str | None:
"""Resolve the main page id starting from CLI arg or by title search."""
if main_id:
found_id, obj_type = notion_utils.find_page_or_database_by_id(notion, main_id)
if found_id and obj_type == "page":
return found_id
# Fallback to title search (case-insensitive)
return notion_utils.find_page(notion, "It Trouble Shooting Hub")
def _fetch_database_id(
notion: Client, parent_page_id: str, db_title: str
) -> str | None:
"""Locate a child database by title inside a given page."""
return notion_utils.find_database_in_block(notion, parent_page_id, db_title)
def _expired_pages(notion: Client, db_id: str) -> list[dict]:
"""Return list of page objects with Verification.state == 'expired'."""
# Query all pages (API max 100 per call). If many pages expected, iterate.
results = notion.databases.query(database_id=db_id).get("results", [])
expired = []
for page in results:
verification_prop = page.get("properties", {}).get("Verification", {})
state = verification_prop.get("verification", {}).get("state")
# Skip the IT Inventory database entry
title_prop = page.get("properties", {}).get("Page", {}).get("title", [])
title_text = title_prop[0].get("plain_text") if title_prop else ""
if title_text.strip().lower() == "it inventory":
continue
if state and "expired" in state.lower():
expired.append(page)
return expired
def _check_callout_present(notion: Client, page_id: str) -> bool:
"""Verify the specified callout is the first child block of the page."""
children = notion.blocks.children.list(block_id=page_id, page_size=1).get(
"results", []
)
if not children:
return False
first_block = children[0]
if first_block.get("type") != "callout":
return False
data = first_block.get("callout", {})
# Check color
if data.get("color") != CALL_OUT_COLOR:
return False
# Check icon
icon = data.get("icon", {})
if icon.get("type") != "emoji" or icon.get("emoji") != CALL_OUT_ICON:
return False
# Check text content (callout rich text plain text)
plain_text = notion_utils.get_block_plain_text(first_block)
return CALL_OUT_TEXT in plain_text
def _find_request_page(notion: Client, db_id: str) -> dict | None:
"""Find the IT Request page with the expected title."""
# Use a simple search inside database
res = notion.databases.query(
database_id=db_id,
filter={"property": "Task name", "title": {"equals": REQUEST_TITLE}},
).get("results", [])
return res[0] if res else None
def _check_request_properties(page: dict) -> bool:
props = page.get("properties", {})
priority = props.get("Priority", {}).get("select", {}).get("name")
status = (
props.get("Status", {}).get("status", {}).get("name")
if props.get("Status", {}).get("status")
else props.get("Status", {}).get("select", {}).get("name")
)
return priority == PRIORITY_HIGH and status == STATUS_IN_PROGRESS
def _request_page_contains_mentions(
notion: Client, request_page_id: str, expected_page_ids: list[str]
) -> bool:
children = notion.blocks.children.list(block_id=request_page_id, page_size=100).get(
"results", []
)
bullet_blocks = [b for b in children if b.get("type") == "bulleted_list_item"]
mentioned_ids: set[str] = set()
for block in bullet_blocks:
rich_text = block.get("bulleted_list_item", {}).get("rich_text", [])
for rt in rich_text:
if rt.get("type") == "mention":
mention = rt.get("mention", {})
if mention.get("type") == "page":
mentioned_ids.add(mention.get("page", {}).get("id"))
if len(mentioned_ids) < len(expected_page_ids):
return False
return all(pid in mentioned_ids for pid in expected_page_ids)
def verify(notion: Client, main_id: str | None = None) -> bool:
main_page_id = _get_main_page_id(notion, main_id)
if not main_page_id:
print(
"Error: Could not locate the main page 'It Trouble Shooting Hub'.",
file=sys.stderr,
)
return False
# Locate required databases
it_home_db_id = _fetch_database_id(notion, main_page_id, IT_HOMEPAGE_DB_TITLE)
it_req_db_id = _fetch_database_id(notion, main_page_id, IT_REQUESTS_DB_TITLE)
if not all([it_home_db_id, it_req_db_id]):
print(
"Error: Required databases not found under the main page.", file=sys.stderr
)
return False
# Identify expired pages
expired_pages = _expired_pages(notion, it_home_db_id)
if not expired_pages:
print(
"Failure: No expired pages found; expected at least one for this task.",
file=sys.stderr,
)
return False
# Verify callout on each expired page
for pg in expired_pages:
pid = pg["id"]
if not _check_callout_present(notion, pid):
print(
f"Failure: Callout missing or incorrect on page {pid}.", file=sys.stderr
)
return False
# Verify IT Request entry
request_page = _find_request_page(notion, it_req_db_id)
if not request_page:
print(
"Failure: IT Request 'Batch Verification Update Required' not found.",
file=sys.stderr,
)
return False
if not _check_request_properties(request_page):
print("Failure: Priority or Status incorrect on IT Request.", file=sys.stderr)
return False
# Verify bullet list in IT Request body
expired_titles = []
for p in expired_pages:
title_prop = p.get("properties", {}).get("Page", {}).get("title", [])
title_text = title_prop[0].get("plain_text") if title_prop else None
if title_text:
expired_titles.append(title_text)
expected_page_ids = [p["id"] for p in expired_pages]
if not _request_page_contains_mentions(
notion, request_page["id"], expected_page_ids
):
print(
"Failure: IT Request body does not contain mentions for all affected pages.",
file=sys.stderr,
)
return False
print("Success: All verification checks passed.")
return True
def main():
notion = notion_utils.get_notion_client()
main_id = sys.argv[1] if len(sys.argv) > 1 else None
if verify(notion, main_id):
sys.exit(0)
sys.exit(1)
if __name__ == "__main__":
main()
================================================
FILE: tasks/notion/standard/japan_travel_planner/daily_itinerary_overview/description.md
================================================
Create a comprehensive daily itinerary overview page to help organize my Japan travel plans. I need you to create a new page called 'Daily Itinerary Overview' as a child of the main Japan Travel Planner page.
**Task Requirements:**
1. Create a new page titled 'Daily Itinerary Overview' as a child page of the main Japan Travel Planner page
2. Query the Travel Itinerary database to retrieve all activities
3. Structure the page with the following specific format:
- Add a heading_1 block with text "📅 Daily Itinerary Overview"
- Add a heading_2 block with text "📊 Trip Summary"
- Under Trip Summary, add a paragraph listing the total number of visited activities
- Create heading_2 blocks for "🌅 Day 1", "🌆 Day 2", and "🌃 Day 3"
- Under each day heading, list the activities scheduled for that day as a to-do list
- Each activity (as a to-do item) should show: Activity Name - City (if available), for example "Osaka Castle - Osaka". Check the item off if the activity has been visited.
4. The summary paragraph must contain the exact text "Total activities visited (from Day 1 to Day 3): [NUMBER]" where [NUMBER] is the actual count.
5. Ensure all headings use the exact emoji and text format specified above
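A minimal sketch of the page structure with the `notion-client` SDK, assuming the parent page ID has been resolved and the Travel Itinerary rows have already been grouped into `activities_by_day` (name, city, visited); the title-property payload for page-parented pages should be checked against the Notion API reference:

```python
from notion_client import Client

notion = Client(auth="<NOTION_TOKEN>")              # hypothetical token placeholder
planner_page_id = "<JAPAN_TRAVEL_PLANNER_PAGE_ID>"  # hypothetical
# Grouped database rows: {"Day 1": [{"name": ..., "city": ..., "visited": bool}], ...}
activities_by_day = {"Day 1": [], "Day 2": [], "Day 3": []}  # placeholder data

visited_total = sum(a["visited"] for day in activities_by_day.values() for a in day)
summary_text = f"Total activities visited (from Day 1 to Day 3): {visited_total}"

def heading(level: int, text: str) -> dict:
    return {f"heading_{level}": {"rich_text": [{"text": {"content": text}}]}}

children = [
    heading(1, "📅 Daily Itinerary Overview"),
    heading(2, "📊 Trip Summary"),
    {"paragraph": {"rich_text": [{"text": {"content": summary_text}}]}},
]
for day, emoji in [("Day 1", "🌅"), ("Day 2", "🌆"), ("Day 3", "🌃")]:
    children.append(heading(2, f"{emoji} {day}"))
    for act in activities_by_day[day]:
        label = f"{act['name']} - {act['city']}" if act["city"] else act["name"]
        children.append({
            "to_do": {
                "rich_text": [{"text": {"content": label}}],
                "checked": act["visited"],  # checked only if the activity was visited
            }
        })

notion.pages.create(
    parent={"page_id": planner_page_id},
    properties={"title": {"title": [{"text": {"content": "Daily Itinerary Overview"}}]}},
    children=children,
)
```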
================================================
FILE: tasks/notion/standard/japan_travel_planner/daily_itinerary_overview/meta.json
================================================
{
"task_id": "daily_itinerary_overview",
"task_name": "Daily Itinerary Overview",
"category_id": "japan_travel_planner",
"category_name": "Japan Travel Planner",
"description": "Create a comprehensive daily itinerary overview page to organize Japan travel plans with structured day-by-day activities.",
"author": "Xiangyan Liu",
"created_at": "2025-07-27",
"difficulty": "L3",
"tags": [
"database manipulation",
"data aggregation",
"report generation",
"visual formatting",
"status tracking"
],
"mcp": [
"notion"
],
"meta_data": {
"stateType": "url",
"stateContent": null,
"stateUrl": "https://painted-tennis-ebc.notion.site/Japan-Travel-Planner-23181626b6d781c4b6bedb12786b5abe"
}
}
================================================
FILE: tasks/notion/standard/japan_travel_planner/daily_itinerary_overview/verify.py
================================================
import sys
import re
from notion_client import Client
from tasks.utils import notion_utils
def verify_todo_database_correspondence(all_blocks, activities_by_day, _):
"""
Verify that to-do items in the overview page correspond exactly to database activities.
"""
# Extract to-do items organized by day from the overview page
todos_by_day = {"Day 1": [], "Day 2": [], "Day 3": []}
current_day = None
checked_todos_count = 0
for block in all_blocks:
block_type = block.get("type")
block_text = notion_utils.get_block_plain_text(block)
# Track which day section we're in
if block_type == "heading_2":
if "🌅 Day 1" in block_text:
current_day = "Day 1"
elif "🌆 Day 2" in block_text:
current_day = "Day 2"
elif "🌃 Day 3" in block_text:
current_day = "Day 3"
else:
current_day = None # Reset for non-day headings
# Collect to-do items under day headings
elif block_type == "to_do" and current_day:
to_do_data = block.get("to_do", {})
is_checked = to_do_data.get("checked", False)
if is_checked:
checked_todos_count += 1
todos_by_day[current_day].append(
{"text": block_text, "checked": is_checked}
)
# Verify each day's activities match
for day in ["Day 1", "Day 2", "Day 3"]:
db_activities = activities_by_day[day]
page_todos = todos_by_day[day]
# Check if counts match
if len(db_activities) != len(page_todos):
print(
f"Error: {day} activity count mismatch. Database has {len(db_activities)} activities, page has {len(page_todos)} to-dos.",
file=sys.stderr,
)
return False
# Verify each database activity has corresponding to-do
for db_activity in db_activities:
expected_format = f"{db_activity['name']}"
if db_activity["city"]:
expected_format += f" - {db_activity['city']}"
# Find matching to-do item
matching_todo = None
for todo in page_todos:
if (
expected_format in todo["text"]
or db_activity["name"] in todo["text"]
):
matching_todo = todo
break
if not matching_todo:
print(
f"Error: {day} - Database activity '{expected_format}' not found in to-do list.",
file=sys.stderr,
)
return False
# Verify checked status matches visited status
if db_activity["visited"] != matching_todo["checked"]:
status_desc = "checked" if db_activity["visited"] else "unchecked"
actual_desc = "checked" if matching_todo["checked"] else "unchecked"
print(
f"Error: {day} - Activity '{db_activity['name']}' should be {status_desc} but is {actual_desc}.",
file=sys.stderr,
)
return False
# Verify summary count matches checked to-dos
for block in all_blocks:
if block.get("type") == "paragraph":
block_text = notion_utils.get_block_plain_text(block)
if "Total activities visited (from Day 1 to Day 3): 8" in block_text:
print(
f"Success: Daily Itinerary Overview page created with correct structure. All {checked_todos_count} visited activities match database."
)
return True
print(
f"Error: Summary shows incorrect visited activity count. Expected: {checked_todos_count} (based on checked to-do items)",
file=sys.stderr,
)
return False
def verify(notion: Client, main_id: str = None) -> bool:
"""
Verifies that the Daily Itinerary Overview page has been created correctly.
"""
# Find the main Japan Travel Planner page
page_id = None
if main_id:
found_id, object_type = notion_utils.find_page_or_database_by_id(
notion, main_id
)
if found_id and object_type == "page":
page_id = found_id
if not page_id:
page_id = notion_utils.find_page(notion, "Japan Travel Planner")
if not page_id:
print("Error: Main 'Japan Travel Planner' page not found.", file=sys.stderr)
return False
# Find the Daily Itinerary Overview child page
overview_page_id = None
try:
# Get all child pages of the main page
response = notion.search(
query="Daily Itinerary Overview",
filter={"property": "object", "value": "page"},
)
for result in response.get("results", []):
# Check if this page is a child of the main page
parent = result.get("parent", {})
if parent.get("type") == "page_id" and parent.get("page_id") == page_id:
overview_page_id = result["id"]
break
if not overview_page_id:
# Alternative method: check page title directly
for result in response.get("results", []):
title_list = (
result.get("properties", {}).get("title", {}).get("title", [])
)
for title_obj in title_list:
if "Daily Itinerary Overview" in title_obj.get("plain_text", ""):
overview_page_id = result["id"]
break
if overview_page_id:
break
except Exception as e:
print(
f"Error searching for Daily Itinerary Overview page: {e}", file=sys.stderr
)
return False
if not overview_page_id:
print(
"Error: 'Daily Itinerary Overview' page not found as child of main page.",
file=sys.stderr,
)
return False
# Get all blocks from the overview page
all_blocks = notion_utils.get_all_blocks_recursively(notion, overview_page_id)
# Required content to verify - must appear in this exact order
required_headings_sequence = [
("📅 Daily Itinerary Overview", "heading_1"),
("📊 Trip Summary", "heading_2"),
("🌅 Day 1", "heading_2"),
("🌆 Day 2", "heading_2"),
("🌃 Day 3", "heading_2"),
]
found_headings_in_order = []
found_summary = False
summary_has_correct_format = False
found_todo_items = False
# Check each block and track heading sequence
for block in all_blocks:
block_text = notion_utils.get_block_plain_text(block)
block_type = block.get("type")
# Check for required headings in sequence
for heading_text, expected_type in required_headings_sequence:
if heading_text in block_text and block_type == expected_type:
found_headings_in_order.append((heading_text, expected_type))
# Check for trip summary paragraph
if (
block_type == "paragraph"
and "Total activities visited (from Day 1 to Day 3):" in block_text
):
found_summary = True
# Check if the format is correct (contains a number)
if re.search(
r"Total activities visited \(from Day 1 to Day 3\):\s*\d+", block_text
):
summary_has_correct_format = True
# Check for to-do list items (activities under day headings)
if block_type == "to_do":
found_todo_items = True
# Check if to-do items follow the format "Activity Name - City"
if " - " in block_text:
# Format appears to be correct (contains dash separator)
pass
# Verify all required headings are found in correct sequence
if len(found_headings_in_order) != len(required_headings_sequence):
missing_headings = []
for heading_text, heading_type in required_headings_sequence:
if (heading_text, heading_type) not in found_headings_in_order:
missing_headings.append(f"{heading_text} ({heading_type})")
print(f"Error: Missing required headings: {missing_headings}", file=sys.stderr)
return False
# Verify headings appear in correct order
for i, (found_heading, found_type) in enumerate(found_headings_in_order):
expected_heading, expected_type = required_headings_sequence[i]
if found_heading != expected_heading or found_type != expected_type:
print(
f"Error: Headings not in correct order. Expected '{expected_heading}' ({expected_type}) at position {i + 1}, but found '{found_heading}' ({found_type})",
file=sys.stderr,
)
return False
# Verify trip summary exists and has correct format
if not found_summary:
print(
"Error: Trip summary paragraph with 'Total activities visite' not found.",
file=sys.stderr,
)
return False
if not summary_has_correct_format:
print(
"Error: Trip summary does not have correct format 'Total activities visited: [NUMBER]'.",
file=sys.stderr,
)
return False
# Verify to-do list items exist (activities should be in to-do format)
if not found_todo_items:
print(
"Error: No to-do list items found. Activities should be listed as to-do items under day headings.",
file=sys.stderr,
)
return False
# Additional verification: Check if Travel Itinerary database exists and has data
try:
itinerary_db_id = notion_utils.find_database_in_block(
notion, page_id, "Travel Itinerary"
)
if not itinerary_db_id:
itinerary_db_id = notion_utils.find_database(notion, "Travel Itinerary")
if itinerary_db_id:
# Query the database to get all activities
db_response = notion.databases.query(database_id=itinerary_db_id)
db_activities = db_response.get("results", [])
# Organize database activities by day
activities_by_day = {"Day 1": [], "Day 2": [], "Day 3": []}
visited_count = 0
for result in db_activities:
properties = result.get("properties", {})
# Extract activity info
activity_info = {"name": "", "city": "", "visited": False, "day": None}
for prop_name, prop_value in properties.items():
prop_type = prop_value.get("type")
# Get activity name (usually from title property)
if prop_type == "title" and prop_value.get("title"):
activity_info["name"] = prop_value["title"][0]["plain_text"]
# Get city info
elif "city" in prop_name.lower() and prop_type in [
"rich_text",
"select",
]:
if prop_type == "rich_text" and prop_value.get("rich_text"):
activity_info["city"] = prop_value["rich_text"][0][
"plain_text"
]
elif prop_type == "select" and prop_value.get("select"):
activity_info["city"] = prop_value["select"]["name"]
# Get visited status
elif prop_type == "checkbox":
if prop_value.get("checkbox"):
activity_info["visited"] = True
visited_count += 1
# Get day info
elif "day" in prop_name.lower() and prop_type in [
"select",
"rich_text",
]:
if prop_type == "select" and prop_value.get("select"):
day_value = prop_value["select"]["name"]
if day_value in activities_by_day:
activity_info["day"] = day_value
elif prop_type == "rich_text" and prop_value.get("rich_text"):
day_value = prop_value["rich_text"][0]["plain_text"]
if day_value in activities_by_day:
activity_info["day"] = day_value
# Add to appropriate day if day is specified
if activity_info["day"] and activity_info["name"]:
activities_by_day[activity_info["day"]].append(activity_info)
# Now verify to-do items match database activities
return verify_todo_database_correspondence(
all_blocks, activities_by_day, visited_count
)
else:
print(
"Warning: Travel Itinerary database not found, using to-do items for count verification."
)
# Count checked to-do items in the overview page even without database
checked_todos_count = 0
for block in all_blocks:
if block.get("type") == "to_do":
to_do_data = block.get("to_do", {})
if to_do_data.get("checked", False):
checked_todos_count += 1
# Verify the summary shows the correct visited count based on checked to-dos
for block in all_blocks:
if block.get("type") == "paragraph":
block_text = notion_utils.get_block_plain_text(block)
if f"Total activities visited: {checked_todos_count}" in block_text:
print(
f"Success: Daily Itinerary Overview page created with correct structure and {checked_todos_count} visited activities."
)
return True
print(
f"Error: Summary shows incorrect visited activity count. Expected: {checked_todos_count} (based on checked to-do items)",
file=sys.stderr,
)
return False
except Exception as e:
print(f"Warning: Could not verify activity count: {e}")
print("Success: Daily Itinerary Overview page created with correct structure.")
return True
def main():
"""
Executes the verification process and exits with a status code.
"""
notion = notion_utils.get_notion_client()
main_id = sys.argv[1] if len(sys.argv) > 1 else None
if verify(notion, main_id):
sys.exit(0)
else:
sys.exit(1)
if __name__ == "__main__":
main()
================================================
FILE: tasks/notion/standard/japan_travel_planner/packing_progress_summary/description.md
================================================
I'm preparing for my Japan trip and need to organize my packing list. Please help me:
**Step 1: Update Items in the Packing List Database**
In the Clothes category, mark all items as packed except for the hat. After this, check the `SIM Card` entry and the `Wallet` entry.
**Step 2: Create Packing Progress Summary**
After adding the items, create a new section in the main Japan Travel Planner page immediately after the "Packing List 💼" heading. This section should contain:
1. A paragraph block with the bold text "**Packing Progress Summary**"
2. Followed by bullet list items showing statistics for each category in the format:
- "Category: X/Y packed" (where X is packed items, Y is total items), for example: "Shoes: 2/10 packed"
- ...
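
For reference, here is a minimal sketch (not part of the task statement) of how the per-category statistics and the summary blocks could be produced with the `notion-client` Python SDK. The auth token placeholder, the IDs passed in, and the use of the `after` parameter to position the new blocks below the heading are assumptions; the `Type` and `Packed` property names follow this template.

```python
from collections import defaultdict

from notion_client import Client

notion = Client(auth="<NOTION_TOKEN>")  # hypothetical token


def build_packing_summary(packing_db_id: str, page_id: str, heading_block_id: str) -> None:
    """Compute 'Category: X/Y packed' stats and append them after the Packing List heading."""
    stats = defaultdict(lambda: {"packed": 0, "total": 0})
    for item in notion.databases.query(database_id=packing_db_id).get("results", []):
        props = item.get("properties", {})
        packed = props.get("Packed", {}).get("checkbox", False)
        for option in props.get("Type", {}).get("multi_select", []):
            category = option.get("name", "")
            stats[category]["total"] += 1
            if packed:
                stats[category]["packed"] += 1

    children = [
        {
            "object": "block",
            "type": "paragraph",
            "paragraph": {
                "rich_text": [
                    {
                        "type": "text",
                        "text": {"content": "Packing Progress Summary"},
                        "annotations": {"bold": True},
                    }
                ]
            },
        }
    ]
    for category, s in sorted(stats.items()):
        children.append(
            {
                "object": "block",
                "type": "bulleted_list_item",
                "bulleted_list_item": {
                    "rich_text": [
                        {
                            "type": "text",
                            "text": {"content": f"{category}: {s['packed']}/{s['total']} packed"},
                        }
                    ]
                },
            }
        )
    # `after` (supported by recent Notion API versions) places the blocks right below the heading.
    notion.blocks.children.append(block_id=page_id, children=children, after=heading_block_id)
```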
================================================
FILE: tasks/notion/standard/japan_travel_planner/packing_progress_summary/meta.json
================================================
{
"task_id": "packing_progress_summary",
"task_name": "Packing Progress Summary",
"category_id": "japan_travel_planner",
"category_name": "Japan Travel Planner",
"description": "Update packing list items and create a progress summary section showing statistics for each category.",
"author": "Xiangyan Liu",
"created_at": "2025-07-27",
"difficulty": "L3",
"tags": [
"database manipulation",
"data aggregation",
"report generation",
"status tracking"
],
"mcp": [
"notion"
],
"meta_data": {
"stateType": "url",
"stateContent": null,
"stateUrl": "https://painted-tennis-ebc.notion.site/Japan-Travel-Planner-23181626b6d781c4b6bedb12786b5abe",
"stateOriginalUrl": "https://www.notion.so/marketplace/templates/japantravelplanner101"
}
}
================================================
FILE: tasks/notion/standard/japan_travel_planner/packing_progress_summary/verify.py
================================================
import sys
from notion_client import Client
from tasks.utils import notion_utils
def verify(notion: Client, main_id: str = None) -> bool:
"""
Verifies that:
1. All Clothes items except hat are marked as packed
2. SIM Card and Wallet entries are checked
3. Packing Progress Summary section is created with statistics
"""
# Find the main Japan Travel Planner page
page_id = None
if main_id:
found_id, object_type = notion_utils.find_page_or_database_by_id(
notion, main_id
)
if found_id and object_type == "page":
page_id = found_id
if not page_id:
page_id = notion_utils.find_page(notion, "Japan Travel Planner")
if not page_id:
print("Error: Page 'Japan Travel Planner' not found.", file=sys.stderr)
return False
# Find the Packing List database
all_blocks = notion_utils.get_all_blocks_recursively(notion, page_id)
packing_list_db_id = None
packing_list_heading_id = None
for i, block in enumerate(all_blocks):
# Find the Packing List heading
if block.get("type") == "heading_2":
heading_text = notion_utils.get_block_plain_text(block)
if "Packing List" in heading_text and "💼" in heading_text:
packing_list_heading_id = block["id"]
# Look for the database after this heading
for j in range(i + 1, len(all_blocks)):
if all_blocks[j].get("type") == "child_database":
packing_list_db_id = all_blocks[j]["id"]
break
break
if not packing_list_db_id:
print("Error: Packing List database not found.", file=sys.stderr)
return False
# Query the database for all items
try:
db_items = notion.databases.query(database_id=packing_list_db_id)
# Track items for verification
clothes_items = []
sim_card_found = False
sim_card_packed = False
wallet_found = False
wallet_packed = False
# Process all items
for page in db_items.get("results", []):
props = page.get("properties", {})
# Get item name
name_prop = props.get("Name", {})
if name_prop.get("type") == "title":
name = "".join(
[t.get("plain_text", "") for t in name_prop.get("title", [])]
)
else:
continue
# Get type (multi_select)
type_prop = props.get("Type", {})
types = []
if type_prop.get("type") == "multi_select":
types = [
opt.get("name", "") for opt in type_prop.get("multi_select", [])
]
# Get packed status
packed_prop = props.get("Packed", {})
packed = False
if packed_prop.get("type") == "checkbox":
packed = packed_prop.get("checkbox", False)
# Check specific items
if name == "SIM Card":
sim_card_found = True
sim_card_packed = packed
elif name == "Wallet":
wallet_found = True
wallet_packed = packed
# Track Clothes items
if "Clothes" in types:
clothes_items.append(
{"name": name, "packed": packed, "is_hat": "hat" in name.lower()}
)
# Verify Clothes items (all packed except hat)
for item in clothes_items:
if item["is_hat"]:
if item["packed"]:
print(
"Error: Hat should not be packed but is marked as packed.",
file=sys.stderr,
)
return False
else:
if not item["packed"]:
print(
f"Error: Clothes item '{item['name']}' should be packed but is not.",
file=sys.stderr,
)
return False
print("Success: All Clothes items are correctly marked (packed except hat).")
# Verify SIM Card and Wallet
if not sim_card_found:
print("Error: SIM Card entry not found.", file=sys.stderr)
return False
if not sim_card_packed:
print("Error: SIM Card entry is not checked (packed).", file=sys.stderr)
return False
if not wallet_found:
print("Error: Wallet entry not found.", file=sys.stderr)
return False
if not wallet_packed:
print("Error: Wallet entry is not checked (packed).", file=sys.stderr)
return False
print("Success: SIM Card and Wallet entries are checked.")
except Exception as e:
print(f"Error querying Packing List database: {e}", file=sys.stderr)
return False
# Expected ground truth statistics
expected_stats = {
"Clothes": {"packed": 12, "total": 13},
"Electronics": {"packed": 1, "total": 10},
"Essentials": {"packed": 1, "total": 12},
"Miscellaneous": {"packed": 0, "total": 10},
"Shoes": {"packed": 0, "total": 2},
"Toiletries": {"packed": 0, "total": 19},
}
# Verify Packing Progress Summary section
# Re-fetch blocks to get updated content
all_blocks = notion_utils.get_all_blocks_recursively(notion, page_id)
# Find the Packing List heading again and check blocks after it
packing_heading_index = None
for i, block in enumerate(all_blocks):
if block.get("id") == packing_list_heading_id:
packing_heading_index = i
break
summary_found = False
statistics_verified = True
found_statistics = {}
if packing_heading_index is not None:
# Look for summary in the next few blocks
for i in range(
packing_heading_index + 1, min(packing_heading_index + 15, len(all_blocks))
):
block = all_blocks[i]
block_text = notion_utils.get_block_plain_text(block)
# Check for "Packing Progress Summary" paragraph
if "Packing Progress Summary" in block_text:
summary_found = True
# Check if it's bold
if block.get("type") == "paragraph":
rich_text_list = block.get("paragraph", {}).get("rich_text", [])
for text_obj in rich_text_list:
if "Packing Progress Summary" in text_obj.get("text", {}).get(
"content", ""
):
if not text_obj.get("annotations", {}).get("bold", False):
print(
"Error: 'Packing Progress Summary' text is not bold.",
file=sys.stderr,
)
return False
# Check for statistics bullet points in format "Category: X/Y packed"
if (
block.get("type") == "bulleted_list_item"
and ":" in block_text
and "/" in block_text
and "packed" in block_text
):
# Parse the statistic line
# Expected format: "Category: X/Y packed"
try:
parts = block_text.split(":")
if len(parts) >= 2:
category = parts[0].strip()
stats_part = parts[1].strip()
# Extract X/Y from "X/Y packed"
if "/" in stats_part and "packed" in stats_part:
nums = stats_part.split("packed")[0].strip()
if "/" in nums:
x_str, y_str = nums.split("/")
x = int(x_str.strip())
y = int(y_str.strip())
found_statistics[category] = {"packed": x, "total": y}
                except (ValueError, IndexError):
                    pass  # Skip bullet points that do not parse as statistics
if not summary_found:
print(
"Error: 'Packing Progress Summary' section not found after Packing List heading.",
file=sys.stderr,
)
return False
if not found_statistics:
print(
"Error: No valid packing statistics bullet points found in format 'Category: X/Y packed'.",
file=sys.stderr,
)
return False
# Verify the statistics match the expected values
for category, stats in expected_stats.items():
if category not in found_statistics:
print(
f"Error: Category '{category}' missing from Packing Progress Summary.",
file=sys.stderr,
)
statistics_verified = False
else:
found = found_statistics[category]
if found["packed"] != stats["packed"] or found["total"] != stats["total"]:
print(
f"Error: Statistics mismatch for '{category}': expected {stats['packed']}/{stats['total']} packed, found {found['packed']}/{found['total']} packed.",
file=sys.stderr,
)
statistics_verified = False
# Check for extra categories in summary that don't exist in expected
for category in found_statistics:
if category not in expected_stats:
print(
f"Error: Unexpected category '{category}' in summary.", file=sys.stderr
)
statistics_verified = False
if not statistics_verified:
return False
print("Success: Packing Progress Summary section created with correct statistics.")
# print(f"Verified statistics: {', '.join(f'{k}: {v['packed']}/{v['total']} packed' for k, v in expected_stats.items())}")
return True
def main():
"""
Executes the verification process and exits with a status code.
"""
notion = notion_utils.get_notion_client()
main_id = sys.argv[1] if len(sys.argv) > 1 else None
if verify(notion, main_id):
sys.exit(0)
else:
sys.exit(1)
if __name__ == "__main__":
main()
================================================
FILE: tasks/notion/standard/japan_travel_planner/remove_osaka_itinerary/description.md
================================================
Go to Japan Travel Planner and remove the Osaka itinerary items scheduled after 6 PM (excluding 6 PM) on Day 1 and Day 2.
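
For reference, a minimal sketch (not part of the task itself) of how the qualifying entries could be located and removed with the `notion-client` SDK; the token placeholder, the property names (`Group`, `Day`, a `Notes` field holding the time), and the archive-to-delete approach are assumptions based on this template and the verifier below.

```python
from notion_client import Client

notion = Client(auth="<NOTION_TOKEN>")  # hypothetical token


def minutes_from_notes(notes: str):
    """Parse strings like '7:30 PM' into minutes since midnight; return None if unparseable."""
    text = notes.strip().upper().split("\n")[0]
    for suffix, offset in (("PM", 12 * 60), ("AM", 0)):
        if suffix in text:
            hours, _, minutes = text.replace(suffix, "").strip().partition(":")
            try:
                return (int(hours) % 12) * 60 + (int(minutes) if minutes else 0) + offset
            except ValueError:
                return None
    return None


def remove_late_osaka_items(itinerary_db_id: str) -> None:
    """Archive Osaka entries on Day 1/Day 2 whose time is strictly after 6 PM."""
    query = notion.databases.query(
        database_id=itinerary_db_id,
        filter={
            "and": [
                {"property": "Group", "select": {"equals": "Osaka"}},
                {
                    "or": [
                        {"property": "Day", "select": {"equals": "Day 1"}},
                        {"property": "Day", "select": {"equals": "Day 2"}},
                    ]
                },
            ]
        },
    )
    for page in query.get("results", []):
        notes = page["properties"].get("Notes", {}).get("rich_text", [])
        minutes = minutes_from_notes(notes[0].get("plain_text", "")) if notes else None
        if minutes is not None and minutes > 18 * 60:  # strictly after 6 PM
            notion.pages.update(page_id=page["id"], archived=True)  # archiving removes the entry
```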
================================================
FILE: tasks/notion/standard/japan_travel_planner/remove_osaka_itinerary/meta.json
================================================
{
"task_id": "remove_osaka_itinerary",
"task_name": "Remove Osaka Itinerary",
"category_id": "japan_travel_planner",
"category_name": "Japan Travel Planner",
"description": "Remove the itinerary items in Osaka after 6 PM from Day 1 and Day 2 travel schedules.",
"author": "Xiangyan Liu",
"created_at": "2025-07-27",
"difficulty": "L3",
"tags": [
"conditional filtering",
"automated migration"
],
"mcp": [
"notion"
],
"meta_data": {
"stateType": "url",
"stateContent": null,
"stateUrl": "https://painted-tennis-ebc.notion.site/Japan-Travel-Planner-23181626b6d781c4b6bedb12786b5abe",
"stateOriginalUrl": "https://www.notion.so/marketplace/templates/japantravelplanner101"
}
}
================================================
FILE: tasks/notion/standard/japan_travel_planner/remove_osaka_itinerary/verify.py
================================================
import sys
from notion_client import Client
from tasks.utils import notion_utils
def get_page_title(page_result):
"""Extract title from a page result"""
properties = page_result.get('properties', {})
name_property = properties.get('Name', {})
if name_property.get('type') == 'title':
title_array = name_property.get('title', [])
if title_array and len(title_array) > 0:
return title_array[0].get('plain_text', '')
return ''
def get_page_time(page_result):
"""Extract time from Notes field"""
properties = page_result.get('properties', {})
notes_property = properties.get('Notes', {})
if notes_property.get('type') == 'rich_text':
rich_text_array = notes_property.get('rich_text', [])
if rich_text_array and len(rich_text_array) > 0:
notes_text = rich_text_array[0].get('plain_text', '')
return notes_text.strip()
return ''
def get_page_group(page_result):
"""Extract group/location from page"""
properties = page_result.get('properties', {})
group_property = properties.get('Group', {})
if group_property.get('type') == 'select':
select = group_property.get('select')
if select:
return select.get('name', '')
return ''
def get_page_day(page_result):
"""Extract day from page"""
properties = page_result.get('properties', {})
day_property = properties.get('Day', {})
if day_property.get('type') == 'select':
select = day_property.get('select')
if select:
return select.get('name', '')
return ''
def parse_time_to_minutes(time_str):
"""Convert time string to minutes for comparison
Returns None if time cannot be parsed"""
if not time_str:
return None
# Clean the time string
time_str = time_str.strip().upper()
# Remove any text after the time (e.g., "7:30 PM\n" -> "7:30 PM")
time_str = time_str.split('\n')[0].strip()
# Extract time components
try:
if 'PM' in time_str:
time_part = time_str.replace('PM', '').strip()
if ':' in time_part:
hours, minutes = time_part.split(':')
hours = int(hours)
minutes = int(minutes)
else:
hours = int(time_part)
minutes = 0
# Convert PM hours (add 12 for PM times except 12 PM)
if hours != 12:
hours += 12
return hours * 60 + minutes
elif 'AM' in time_str:
time_part = time_str.replace('AM', '').strip()
if ':' in time_part:
hours, minutes = time_part.split(':')
hours = int(hours)
minutes = int(minutes)
else:
hours = int(time_part)
minutes = 0
# Handle 12 AM (midnight)
if hours == 12:
hours = 0
return hours * 60 + minutes
    except ValueError:
        return None
return None
def verify(notion: Client, main_id: str = None) -> bool:
"""
Verifies that all OSAKA events after 6PM have been removed from Day 1 and Day 2 in the Japan Travel Planner.
Expected items that should be deleted (all in OSAKA, after 6PM, on Day 1 or Day 2):
1. Rikuro's Namba Main Branch - 7 PM (Day 1)
2. Shin Sekai "New World" - 8 PM (Day 2)
3. Katsudon Chiyomatsu - 7:30 PM (Day 2)
4. Ebisubashi Bridge - 9 PM (Day 1)
Note: Kuromon Ichiba Market at 6 PM should NOT be deleted (it's at 6PM, not after)
Items after 6PM on other days (Day 3-8) should NOT be deleted
"""
# Step 1: Find the main Japan Travel Planner page
if main_id:
found_id, object_type = notion_utils.find_page_or_database_by_id(notion, main_id)
if not found_id or object_type != 'page':
print("Error: Japan Travel Planner page not found.", file=sys.stderr)
return False
else:
# Try to find the page by searching
found_id = notion_utils.find_page(notion, "Japan Travel Planner")
if not found_id:
print("Error: Japan Travel Planner page not found.", file=sys.stderr)
return False
print(f"Found Japan Travel Planner page: {found_id}")
# Step 2: Find the Travel Itinerary database
all_blocks = notion_utils.get_all_blocks_recursively(notion, found_id)
travel_itinerary_db_id = None
for block in all_blocks:
if block and block.get("type") == "child_database":
title = block.get("child_database", {}).get("title", "")
if "Travel Itinerary" in title:
travel_itinerary_db_id = block.get("id")
print(f"Found Travel Itinerary database: {travel_itinerary_db_id}")
break
if not travel_itinerary_db_id:
print("Error: Travel Itinerary database not found", file=sys.stderr)
return False
# Step 3: Query the database for OSAKA items on Day 1 and Day 2
try:
query_result = notion.databases.query(
database_id=travel_itinerary_db_id,
filter={
"and": [
{"property": "Group", "select": {"equals": "Osaka"}},
{"or": [
{"property": "Day", "select": {"equals": "Day 1"}},
{"property": "Day", "select": {"equals": "Day 2"}}
]}
]
}
)
except Exception as e:
print(f"Error querying Travel Itinerary database: {e}", file=sys.stderr)
return False
# Step 4: Check for items that should have been deleted
six_pm_minutes = 18 * 60 # 6 PM in minutes (18:00)
# Expected deleted items (4 specific items after 6 PM on Day 1 and Day 2)
expected_deleted = {
"Rikuro's Namba Main Branch": {"time": "7 PM", "day": "Day 1", "found": False},
"Shin Sekai \"New World\"": {"time": "8 PM", "day": "Day 2", "found": False},
"Katsudon Chiyomatsu": {"time": "7:30 PM", "day": "Day 2", "found": False},
"Ebisubashi Bridge": {"time": "9 PM", "day": "Day 1", "found": False}
}
# Items that should remain (at or before 6 PM)
expected_remaining = {
"Kuromon Ichiba Market": {"time": "6 PM", "found": False}
}
osaka_items_after_6pm = []
osaka_items_at_or_before_6pm = []
# Debug: Show total query results
print(f"Debug: Found {len(query_result.get('results', []))} total OSAKA items on Day 1 and Day 2")
# Process all OSAKA items on Day 1 and Day 2
for page in query_result.get('results', []):
page_title = get_page_title(page).strip()
page_time = get_page_time(page)
page_group = get_page_group(page)
page_day = get_page_day(page)
if page_group != "Osaka":
continue
# Parse time to check if after 6 PM
time_minutes = parse_time_to_minutes(page_time)
if time_minutes is not None and time_minutes > six_pm_minutes:
osaka_items_after_6pm.append({
"title": page_title,
"time": page_time,
"day": page_day,
"id": page.get('id')
})
# Check if this is one of the expected deleted items
for expected_title, expected_info in expected_deleted.items():
# Clean up the titles for comparison
clean_page_title = page_title.strip().lower()
clean_expected_title = expected_title.strip().lower()
# Check for "Rikuro's" or "Rikuro's" (different apostrophe types)
if "rikuro" in clean_page_title and "rikuro" in clean_expected_title:
title_match = True
elif clean_page_title == clean_expected_title:
title_match = True
elif clean_expected_title in clean_page_title or clean_page_title in clean_expected_title:
title_match = True
else:
title_match = False
if title_match and page_day == expected_info["day"]:
print(f"Debug: Found '{page_title}' on {page_day} at {page_time} - matches expected '{expected_title}'")
expected_deleted[expected_title]["found"] = True
elif time_minutes is not None and time_minutes <= six_pm_minutes:
osaka_items_at_or_before_6pm.append({
"title": page_title,
"time": page_time,
"day": page_day,
"id": page.get('id')
})
# Check if this is one of the expected remaining items
for expected_title in expected_remaining:
if expected_title.lower() in page_title.lower() or page_title.lower() in expected_title.lower():
expected_remaining[expected_title]["found"] = True
# Step 5: Verify results
print(f"\nVerification Summary:")
print(f"=" * 50)
all_passed = True
# Check that the 4 expected items after 6 PM have been deleted
print("\n4 Items that should be deleted (after 6 PM on Day 1 and Day 2):")
for item_name, item_info in expected_deleted.items():
if item_info["found"]:
# If found = True, it means the item still exists (was not deleted)
print(f"✗ {item_name} ({item_info['day']}, {item_info['time']}) - Still exists, should be deleted", file=sys.stderr)
all_passed = False
else:
# If found = False, it means the item was deleted correctly
print(f"✓ {item_name} ({item_info['day']}, {item_info['time']}) - Correctly deleted")
# Check that items at or before 6 PM remain
print("\nItems that should remain (at or before 6 PM on Day 1 and Day 2):")
for item_name, item_info in expected_remaining.items():
if item_info["found"]:
print(f"✓ {item_name} ({item_info['time']}) - Correctly retained")
else:
print(f"✗ {item_name} ({item_info['time']}) - Missing, should not be deleted", file=sys.stderr)
all_passed = False
# Report any items after 6 PM that still exist
if osaka_items_after_6pm:
print(f"\n✗ Found {len(osaka_items_after_6pm)} OSAKA item(s) after 6 PM on Day 1/Day 2:", file=sys.stderr)
for item in osaka_items_after_6pm:
print(f" - {item['title']} at {item['time']} ({item['day']})", file=sys.stderr)
else:
print(f"\n✓ No OSAKA items found after 6 PM on Day 1/Day 2 (all correctly deleted)")
# Report count summary
print(f"\nCount Summary:")
print(f"- OSAKA items after 6 PM on Day 1/Day 2 found: {len(osaka_items_after_6pm)} (should be 0)")
print(f"- OSAKA items at/before 6 PM on Day 1/Day 2 found: {len(osaka_items_at_or_before_6pm)}")
print(f"- Expected deletions verified: {sum(1 for item in expected_deleted.values() if not item['found'])}/4")
return all_passed
def main():
"""
Executes the verification process and exits with a status code.
"""
notion = notion_utils.get_notion_client()
main_id = sys.argv[1] if len(sys.argv) > 1 else None
if verify(notion, main_id):
print("\nVerification passed: All 4 required OSAKA events after 6 PM on Day 1 and Day 2 have been removed")
sys.exit(0)
else:
print("\nVerification failed: Some OSAKA events after 6 PM on Day 1/Day 2 still exist")
sys.exit(1)
if __name__ == "__main__":
main()
================================================
FILE: tasks/notion/standard/japan_travel_planner/restaurant_expenses_sync/description.md
================================================
Please find the restaurants that appear in Day 1 of the Travel Itinerary database, then create corresponding entries in the Expenses database, one restaurant per entry. Set the date uniformly to Jan 1, 2025, and the cost uniformly to $120. Display the restaurant name in the Expense field. Set Category to Dining. For Comment, use the Description from the corresponding restaurant page. Leave other properties empty.
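
For reference, a minimal sketch (not part of the task itself) of how this could be scripted with the `notion-client` SDK. The property names (`Name`, `Description`, `Day`, `Type`, `Expense`, `Date`, `Transaction Amount`, `Category`, `Comment`) mirror the verifier below; the token placeholder and the assumption that the Day 1 restaurants are the entries tagged `Food` are illustrative.

```python
from notion_client import Client

notion = Client(auth="<NOTION_TOKEN>")  # hypothetical token


def sync_day1_restaurants(itinerary_db_id: str, expenses_db_id: str) -> None:
    """Create one Expenses entry per Day 1 restaurant found in the Travel Itinerary database."""
    restaurants = notion.databases.query(
        database_id=itinerary_db_id,
        filter={
            "and": [
                {"property": "Day", "select": {"equals": "Day 1"}},
                {"property": "Type", "multi_select": {"contains": "Food"}},
            ]
        },
    ).get("results", [])

    for entry in restaurants:
        props = entry.get("properties", {})
        name = "".join(t.get("plain_text", "") for t in props.get("Name", {}).get("title", []))
        description = "".join(
            t.get("plain_text", "") for t in props.get("Description", {}).get("rich_text", [])
        )
        notion.pages.create(
            parent={"database_id": expenses_db_id},
            properties={
                "Expense": {"title": [{"text": {"content": name}}]},
                "Date": {"date": {"start": "2025-01-01"}},
                "Transaction Amount": {"number": 120},
                "Category": {"multi_select": [{"name": "Dining"}]},
                "Comment": {"rich_text": [{"text": {"content": description}}]},
            },
        )
```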
================================================
FILE: tasks/notion/standard/japan_travel_planner/restaurant_expenses_sync/meta.json
================================================
{
"task_id": "restaurant_expenses_sync",
"task_name": "Restaurant Expenses Sync",
"category_id": "japan_travel_planner",
"category_name": "Japan Travel Planner",
"description": "Find restaurants from Day 1 Travel Itinerary and create corresponding entries in the Expenses database.",
"author": "Xiangyan Liu",
"created_at": "2025-07-27",
"difficulty": "L3",
"tags": [
"conditional filtering",
"database manipulation",
"cross-reference linking",
"template population"
],
"mcp": [
"notion"
],
"meta_data": {
"stateType": "url",
"stateContent": null,
"stateUrl": "https://painted-tennis-ebc.notion.site/Japan-Travel-Planner-23181626b6d781c4b6bedb12786b5abe",
"stateOriginalUrl": "https://www.notion.so/marketplace/templates/japantravelplanner101"
}
}
================================================
FILE: tasks/notion/standard/japan_travel_planner/restaurant_expenses_sync/verify.py
================================================
import sys
from notion_client import Client
from tasks.utils import notion_utils
def verify(notion: Client, main_id: str = None) -> bool:
"""
Verifies that restaurants from Day 1 of Travel Itinerary have corresponding expense entries.
"""
page_id = None
if main_id:
found_id, object_type = notion_utils.find_page_or_database_by_id(
notion, main_id
)
if found_id and object_type == "page":
page_id = found_id
if not page_id:
page_id = notion_utils.find_page(notion, "Japan Travel Planner")
if not page_id:
print("Error: Page 'Japan Travel Planner' not found.", file=sys.stderr)
return False
# Find Travel Itinerary database
itinerary_db_id = notion_utils.find_database_in_block(
notion, page_id, "Travel Itinerary"
)
if not itinerary_db_id:
print("Error: Database 'Travel Itinerary' not found.", file=sys.stderr)
return False
# Find Expenses database
expenses_db_id = notion_utils.find_database_in_block(notion, page_id, "Expenses")
if not expenses_db_id:
print("Error: Database 'Expenses' not found.", file=sys.stderr)
return False
    # Restaurant descriptions are read from the Travel Itinerary database entries
    places_db_id = notion_utils.find_database_in_block(
        notion, page_id, "Travel Itinerary"
    )
    if not places_db_id:
        print("Error: Database 'Travel Itinerary' not found.", file=sys.stderr)
        return False
# Query Day 1 restaurants from Travel Itinerary
try:
itinerary_results = notion.databases.query(
database_id=itinerary_db_id,
filter={
"and": [
{"property": "Day", "select": {"equals": "Day 1"}},
{"property": "Type", "multi_select": {"contains": "Food"}},
]
},
).get("results", [])
except Exception as e:
print(f"Error querying Travel Itinerary database: {e}", file=sys.stderr)
return False
if not itinerary_results:
print(
"Error: No restaurants found for Day 1 in Travel Itinerary.",
file=sys.stderr,
)
return False
# Extract restaurant names
restaurant_names = []
for entry in itinerary_results:
props = entry.get("properties", {})
name_prop = props.get("Name", {})
name_text = "".join(t.get("plain_text", "") for t in name_prop.get("title", []))
if name_text:
restaurant_names.append(name_text.strip())
if not restaurant_names:
print("Error: No restaurant names found in Day 1 entries.", file=sys.stderr)
return False
    # Fetch restaurant descriptions from the Travel Itinerary database
try:
places_results = notion.databases.query(database_id=places_db_id).get(
"results", []
)
except Exception as e:
print(f"Error querying Japan Places to Visit database: {e}", file=sys.stderr)
return False
# Create a map of restaurant names to descriptions
restaurant_descriptions = {}
for place in places_results:
props = place.get("properties", {})
name_prop = props.get("Name", {})
name_text = "".join(t.get("plain_text", "") for t in name_prop.get("title", []))
desc_prop = props.get("Description", {})
desc_text = "".join(
t.get("plain_text", "") for t in desc_prop.get("rich_text", [])
)
if name_text and desc_text:
restaurant_descriptions[name_text.strip()] = desc_text.strip()
# Query Expenses database
try:
expenses_results = notion.databases.query(database_id=expenses_db_id).get(
"results", []
)
except Exception as e:
print(f"Error querying Expenses database: {e}", file=sys.stderr)
return False
# Verify each restaurant has a corresponding expense entry
verified_restaurants = []
for restaurant_name in restaurant_names:
found_matching_expense = False
expected_description = restaurant_descriptions.get(restaurant_name, "")
for expense in expenses_results:
props = expense.get("properties", {})
# Check Expense field (title)
expense_prop = props.get("Expense", {})
expense_text = "".join(
t.get("plain_text", "") for t in expense_prop.get("title", [])
)
if expense_text.strip() != restaurant_name:
continue
# Check Date
date_prop = props.get("Date", {})
date_start = date_prop.get("date", {}).get("start")
if date_start != "2025-01-01":
continue
# Check Transaction Amount
amount_prop = props.get("Transaction Amount", {})
amount = amount_prop.get("number")
if amount != 120:
continue
# Check Category contains Dining
category_prop = props.get("Category", {})
categories = [c.get("name") for c in category_prop.get("multi_select", [])]
if "Dining" not in categories:
continue
# Check Comment matches description (if description exists)
if expected_description:
comment_prop = props.get("Comment", {})
comment_text = "".join(
t.get("plain_text", "") for t in comment_prop.get("rich_text", [])
)
if comment_text.strip().replace(
"\u202f", " "
) != expected_description.replace("\u202f", " "):
continue
found_matching_expense = True
verified_restaurants.append(restaurant_name)
break
if not found_matching_expense:
print(
f"Error: No matching expense entry found for restaurant '{restaurant_name}'.",
file=sys.stderr,
)
return False
if len(verified_restaurants) == len(restaurant_names):
print(
f"Success: Found matching expense entries for all {len(restaurant_names)} Day 1 restaurants."
)
return True
else:
print(
f"Error: Only {len(verified_restaurants)} out of {len(restaurant_names)} restaurants have matching expense entries.",
file=sys.stderr,
)
return False
def main():
"""
Executes the verification process and exits with a status code.
"""
notion = notion_utils.get_notion_client()
main_id = sys.argv[1] if len(sys.argv) > 1 else None
if verify(notion, main_id):
sys.exit(0)
else:
sys.exit(1)
if __name__ == "__main__":
main()
================================================
FILE: tasks/notion/standard/online_resume/layout_adjustment/description.md
================================================
Please go to my Online Resume page and adjust the Skills display with the following requirements:
## Skills Section Adjustment
1. Delete the Skills database from the right side of the page
2. Add a new Skills section on the left side, under the Languages section
3. Format skills as "[icon] skill description (type)", for example "✨✨ Photoshop (Design Tool)"
- Use ✨✨ icon for skills with level >= 50%
- Use ✨ icon for skills with level < 50%
## Work History and Education Layout Adjustment
1. Adjust the layout so that logo/image columns take up 50% width in each section
- Note: Column width ratio might not be returned by API when columns are equal (50/50)
2. Replace all images/icons with black placeholder images using URL containing "https://singlecolorimage.com/get/000000/1024x128"
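
For reference, a minimal sketch (not part of the task itself) of how the new Skills section could be rendered and appended with the `notion-client` SDK. The `Skill` and `Skill Level` property names come from this template, while the `Type` select property and the left-column block ID are assumptions; the heading, divider, and bold paragraph structure follows the verifier below.

```python
from notion_client import Client

notion = Client(auth="<NOTION_TOKEN>")  # hypothetical token


def skill_lines(skills_db_id: str):
    """Render each skill as '[icon] name (type)', e.g. '✨✨ Photoshop (Design Tool)'."""
    lines = []
    for row in notion.databases.query(database_id=skills_db_id).get("results", []):
        props = row.get("properties", {})
        name = "".join(t.get("plain_text", "") for t in props.get("Skill", {}).get("title", []))
        level = props.get("Skill Level", {}).get("number") or 0  # stored as a fraction, e.g. 0.6
        skill_type = (props.get("Type", {}).get("select") or {}).get("name", "")  # assumed property
        icon = "✨✨" if level >= 0.5 else "✨"
        lines.append(f"{icon} {name} ({skill_type})")
    return lines


def append_skills_section(left_column_id: str, skills_db_id: str) -> None:
    """Append a Skills heading, a divider, and one bold paragraph per skill to the left column."""
    children = [
        {"object": "block", "type": "heading_2",
         "heading_2": {"rich_text": [{"type": "text", "text": {"content": "Skills"}}]}},
        {"object": "block", "type": "divider", "divider": {}},
    ]
    for line in skill_lines(skills_db_id):
        children.append(
            {"object": "block", "type": "paragraph",
             "paragraph": {"rich_text": [{"type": "text", "text": {"content": line},
                                          "annotations": {"bold": True}}]}}
        )
    notion.blocks.children.append(block_id=left_column_id, children=children)
```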
================================================
FILE: tasks/notion/standard/online_resume/layout_adjustment/meta.json
================================================
{
"task_id": "layout_adjustment",
"task_name": "Layout Adjustment",
"category_id": "online_resume",
"category_name": "Online Resume",
"description": "This task involves modifying the layout and content of an online resume page by restructuring the Skills section with icon indicators and adjusting the Work History and Education sections to use equal column widths with placeholder images.",
"author": "Xiangyan Liu",
"created_at": "2025-08-14",
"difficulty": "L3",
"tags": [
"content organization",
"visual formatting",
"conditional filtering",
"template population"
],
"mcp": [
"notion"
],
"meta_data": {
"stateType": "url",
"stateContent": null,
"stateUrl": "https://painted-tennis-ebc.notion.site/Online-Resume-23181626b6d781159faaeb5eadaf612e",
"stateOriginalUrl": "https://www.notion.so/marketplace/templates/online-resume"
}
}
================================================
FILE: tasks/notion/standard/online_resume/layout_adjustment/verify.py
================================================
import sys
from notion_client import Client
from tasks.utils import notion_utils
def verify(notion: Client, main_id: str = None) -> bool:
"""
Verifies that the Skills display has been adjusted correctly:
1. Skills database on the right side should be deleted
2. Skills section should be added on the left side under Languages
3. Skills should be formatted with correct icons based on skill level
4. Work History and Education sections should use black placeholder images
"""
page_id = None
if main_id:
found_id, object_type = notion_utils.find_page_or_database_by_id(
notion, main_id
)
if found_id and object_type == "page":
page_id = found_id
if not page_id:
page_id = notion_utils.find_page(notion, "Online Resume")
if not page_id:
print("Error: Page 'Online Resume' not found.", file=sys.stderr)
return False
all_blocks = notion_utils.get_all_blocks_recursively(notion, page_id)
# Step 1: Verify Skills database is NOT in the right column anymore
# Find the main column list
for block in all_blocks:
if block.get("type") == "column_list":
column_list_id = block["id"]
columns = notion_utils.get_all_blocks_recursively(notion, column_list_id)
# Check if this is the main two-column layout
if len(columns) == 2:
# Find the right column (usually the one with larger width ratio)
for column in columns:
if column.get("type") == "column":
width_ratio = column.get("column", {}).get("width_ratio", 0)
# Right column typically has width_ratio > 0.5
if width_ratio > 0.5:
right_column_id = column["id"]
right_column_blocks = notion_utils.get_all_blocks_recursively(
notion, right_column_id
)
# Check if Skills database exists in right column
for right_block in right_column_blocks:
if (
right_block.get("type") == "child_database"
and right_block.get("child_database", {}).get("title") == "Skills"
):
print(
"Error: Skills database still exists in the right column.",
file=sys.stderr,
)
return False
# Step 2: Find the left column and verify Skills section exists there
skills_section_found = False
skills_with_double_sparkles = []
skills_with_single_sparkle = []
# First, find the main column_list (top-level)
main_column_list_id = None
for block in all_blocks:
if block.get("type") == "column_list" and block.get("parent", {}).get("type") == "page_id":
main_column_list_id = block["id"]
break
if not main_column_list_id:
print("Error: Main column list not found.", file=sys.stderr)
return False
# Get the columns directly
columns = notion_utils.get_all_blocks_recursively(notion, main_column_list_id)
# Find the left column (the one with width_ratio around 0.25)
left_column_id = None
for column in columns:
if column.get("type") == "column":
width_ratio = column.get("column", {}).get("width_ratio", 0)
# Left column has width_ratio around 0.25
if 0.2 <= width_ratio <= 0.3:
left_column_id = column["id"]
break
if not left_column_id:
print("Error: Left column not found.", file=sys.stderr)
return False
# Get all blocks in the left column
left_column_blocks = notion_utils.get_all_blocks_recursively(notion, left_column_id)
# Find Languages heading
languages_index = -1
for i, left_block in enumerate(left_column_blocks):
if (
left_block.get("type") == "heading_2"
and "Languages" in notion_utils.get_block_plain_text(left_block)
):
languages_index = i
break
if languages_index == -1:
print("Error: Languages heading not found in left column.", file=sys.stderr)
return False
# Look for Skills heading after Languages
for i in range(languages_index + 1, len(left_column_blocks)):
left_block = left_column_blocks[i]
if (
left_block.get("type") == "heading_2"
and "Skills" in notion_utils.get_block_plain_text(left_block)
):
skills_section_found = True
# Check divider after Skills heading
if i + 1 < len(left_column_blocks):
next_block = left_column_blocks[i + 1]
if next_block.get("type") != "divider":
print(
"Error: Divider not found after Skills heading.",
file=sys.stderr,
)
return False
# Collect skills after divider
for j in range(i + 2, len(left_column_blocks)):
skill_block = left_column_blocks[j]
if skill_block.get("type") == "paragraph":
skill_text = notion_utils.get_block_plain_text(skill_block)
if skill_text and skill_text.strip(): # Check for non-empty text
# Check if text is bold
rich_text = skill_block.get("paragraph", {}).get("rich_text", [])
if rich_text and not rich_text[0].get("annotations", {}).get("bold"):
print(
f"Error: Skill '{skill_text}' is not bold.",
file=sys.stderr,
)
return False
# Check icon format
if skill_text.startswith("✨✨"):
skills_with_double_sparkles.append(skill_text)
elif skill_text.startswith("✨"):
skills_with_single_sparkle.append(skill_text)
else:
print(
f"Error: Skill '{skill_text}' doesn't start with sparkle icon.",
file=sys.stderr,
)
return False
# Check format includes type in parentheses
if "(" not in skill_text or ")" not in skill_text:
print(
f"Error: Skill '{skill_text}' doesn't include type in parentheses.",
file=sys.stderr,
)
return False
elif skill_block.get("type") in ["heading_1", "heading_2", "heading_3"]:
# Stop when we reach another section
break
break
if not skills_section_found:
print(
"Error: Skills section not found in the left column under Languages.",
file=sys.stderr,
)
return False
# Step 3: Verify we have the expected skills
expected_double_sparkle_skills = [
"Photoshop",
"Figma",
"Notion",
"Framer"
]
expected_single_sparkle_skills = [
"Webflow",
"Rive",
"CSS + Basic JS"
]
# Check if all expected skills are present
for skill_name in expected_double_sparkle_skills:
found = any(skill_name in skill for skill in skills_with_double_sparkles)
if not found:
print(
f"Error: Expected skill '{skill_name}' with ✨✨ not found.",
file=sys.stderr,
)
return False
for skill_name in expected_single_sparkle_skills:
found = any(skill_name in skill for skill in skills_with_single_sparkle)
if not found:
print(
f"Error: Expected skill '{skill_name}' with ✨ not found.",
file=sys.stderr,
)
return False
# Step 4: Verify Work History and Education sections have black placeholder images
work_history_images_found = 0
education_images_found = 0
black_placeholder_url = "https://singlecolorimage.com/get/000000/"
# Find Work History and Education sections in the right column
right_column_id = None
for column in columns:
if column.get("type") == "column":
width_ratio = column.get("column", {}).get("width_ratio", 0.5)
# Right column has width_ratio around 0.75 or no width_ratio (which means equal split)
if width_ratio > 0.6 or width_ratio == 0.5:
right_column_id = column["id"]
break
if right_column_id:
right_column_blocks = notion_utils.get_all_blocks_recursively(notion, right_column_id)
# Find Work History section
work_history_index = -1
education_index = -1
for i, block in enumerate(right_column_blocks):
if block.get("type") == "heading_1":
heading_text = notion_utils.get_block_plain_text(block)
if "Work History" in heading_text:
work_history_index = i
elif "Education" in heading_text:
education_index = i
# Check Work History column lists for images
if work_history_index != -1:
for i in range(work_history_index + 1, min(education_index if education_index > work_history_index else len(right_column_blocks), len(right_column_blocks))):
block = right_column_blocks[i]
if block.get("type") == "column_list":
column_list_blocks = notion_utils.get_all_blocks_recursively(notion, block["id"])
for column in column_list_blocks:
if column.get("type") == "column":
# Check width_ratio - must be 50% (0.5) or absent (which defaults to 50%)
col_width = column.get("column", {}).get("width_ratio")
# First column should be image column (either no ratio=50%, or exactly 0.5)
if col_width is None or col_width == 0.5:
column_contents = notion_utils.get_all_blocks_recursively(notion, column["id"])
for content_block in column_contents:
if content_block.get("type") == "embed":
embed_url = content_block.get("embed", {}).get("url", "")
if black_placeholder_url in embed_url:
work_history_images_found += 1
elif content_block.get("type") == "image":
# Also check for image blocks with external URL
image_url = content_block.get("image", {}).get("external", {}).get("url", "")
if black_placeholder_url in image_url:
work_history_images_found += 1
break # Only check first column
# Check Education column list for images
if education_index != -1:
for i in range(education_index + 1, len(right_column_blocks)):
block = right_column_blocks[i]
if block.get("type") == "heading_1":
break # Stop at next section
if block.get("type") == "column_list":
column_list_blocks = notion_utils.get_all_blocks_recursively(notion, block["id"])
for column in column_list_blocks:
if column.get("type") == "column":
# Check width_ratio - must be 50% (0.5) or absent (which defaults to 50%)
col_width = column.get("column", {}).get("width_ratio")
# First column should be image column (either no ratio=50%, or exactly 0.5)
if col_width is None or col_width == 0.5:
column_contents = notion_utils.get_all_blocks_recursively(notion, column["id"])
for content_block in column_contents:
if content_block.get("type") == "embed":
embed_url = content_block.get("embed", {}).get("url", "")
if black_placeholder_url in embed_url:
education_images_found += 1
elif content_block.get("type") == "image":
image_url = content_block.get("image", {}).get("external", {}).get("url", "")
if black_placeholder_url in image_url:
education_images_found += 1
break # Only check first column
break # Only check first column_list in Education
# Verify images were found
if work_history_images_found < 2:
print(
f"Warning: Expected at least 2 Work History images with black placeholder, found {work_history_images_found}.",
file=sys.stderr,
)
return False
if education_images_found < 1:
print(
f"Warning: Expected at least 1 Education image with black placeholder, found {education_images_found}.",
file=sys.stderr,
)
return False
print("Success: Skills display adjusted correctly.")
print(f"- Found {len(skills_with_double_sparkles)} skills with ✨✨ (skill level >= 50%)")
print(f"- Found {len(skills_with_single_sparkle)} skills with ✨ (skill level < 50%)")
print("- Skills database removed from right column")
print("- Skills section added to left column under Languages")
print(f"- Found {work_history_images_found} Work History images with black placeholder")
print(f"- Found {education_images_found} Education images with black placeholder")
return True
def main():
"""
Executes the verification process and exits with a status code.
"""
notion = notion_utils.get_notion_client()
main_id = sys.argv[1] if len(sys.argv) > 1 else None
if verify(notion, main_id):
sys.exit(0)
else:
sys.exit(1)
if __name__ == "__main__":
main()
================================================
FILE: tasks/notion/standard/online_resume/projects_section_update/description.md
================================================
Find the page named "Online Resume" and reorganize the projects section to showcase only the most recent and relevant work.
**Task Requirements:**
1. Delete the project named "Knitties eComm Website" from the Projects database since it's from 2022 and no longer relevant
2. Create a new project entry called "Zapier Dashboard Redesign" with:
- Description: "Led the complete redesign of Zapier's main dashboard, focusing on improved usability and modern design patterns. Implemented new navigation system and responsive layouts."
- Date: Start "2024-01-01", End "2024-06-30"
- Tags: Add the existing "UI Design" tag, and create a new tag "Enterprise" with purple color, then add both tags to this project
- Phone: Same as the phone number under the Contact section
- Url: Same as the personal website under the Contact section
3. After the Projects database block, add the following blocks in sequence:
- A divider block
- A heading_2 block with text "Current Focus"
- A paragraph block with content that dynamically references:
- The highest skill level from your Skills database (find the skill with the highest Skill Level percentage)
- Incorporate this into the text: "The Zapier Dashboard Redesign represents my most impactful recent work, leveraging my expertise in [highest skill name] ([skill level]%) to deliver enterprise-grade solutions that prioritize both aesthetics and functionality."
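
For reference, a minimal sketch (not part of the task itself) of how the new entry and the trailing blocks could be created with the `notion-client` SDK. The token placeholder, the phone/website arguments (copied from the Contact section), and the use of the `after` parameter are assumptions; setting the purple color on the new `Enterprise` tag would additionally require updating the Tags options via `databases.update`.

```python
from notion_client import Client

notion = Client(auth="<NOTION_TOKEN>")  # hypothetical token

DESCRIPTION = (
    "Led the complete redesign of Zapier's main dashboard, focusing on improved usability "
    "and modern design patterns. Implemented new navigation system and responsive layouts."
)


def add_zapier_project(projects_db_id: str, phone: str, website: str) -> None:
    """Create the 'Zapier Dashboard Redesign' entry; phone/website come from the Contact section."""
    notion.pages.create(
        parent={"database_id": projects_db_id},
        properties={
            "Name": {"title": [{"text": {"content": "Zapier Dashboard Redesign"}}]},
            "Description": {"rich_text": [{"text": {"content": DESCRIPTION}}]},
            "Date": {"date": {"start": "2024-01-01", "end": "2024-06-30"}},
            # pages.create alone assigns a default color to a new option; the purple color for
            # "Enterprise" has to be set on the Tags property schema via databases.update.
            "Tags": {"multi_select": [{"name": "UI Design"}, {"name": "Enterprise"}]},
            "Phone": {"phone_number": phone},
            "Url": {"url": website},
        },
    )


def append_current_focus(page_id: str, projects_db_block_id: str, skill: str, level_pct: int) -> None:
    """Append divider + 'Current Focus' heading + summary paragraph after the Projects database."""
    text = (
        "The Zapier Dashboard Redesign represents my most impactful recent work, leveraging my "
        f"expertise in {skill} ({level_pct}%) to deliver enterprise-grade solutions that prioritize "
        "both aesthetics and functionality."
    )
    notion.blocks.children.append(
        block_id=page_id,
        after=projects_db_block_id,  # `after` requires a Notion API version with positioned appends
        children=[
            {"object": "block", "type": "divider", "divider": {}},
            {"object": "block", "type": "heading_2",
             "heading_2": {"rich_text": [{"type": "text", "text": {"content": "Current Focus"}}]}},
            {"object": "block", "type": "paragraph",
             "paragraph": {"rich_text": [{"type": "text", "text": {"content": text}}]}},
        ],
    )
```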
================================================
FILE: tasks/notion/standard/online_resume/projects_section_update/meta.json
================================================
{
"task_id": "projects_section_update",
"task_name": "Projects Section Update",
"category_id": "online_resume",
"category_name": "Online Resume",
"description": "Reorganize the projects section by removing outdated projects and adding new relevant work with proper formatting.",
"author": "Xiangyan Liu",
"created_at": "2025-07-27",
"difficulty": "L3",
"tags": [
"database manipulation",
"template population",
"data aggregation",
"visual formatting",
"cross-reference linking"
],
"mcp": [
"notion"
],
"meta_data": {
"stateType": "url",
"stateContent": null,
"stateUrl": "https://painted-tennis-ebc.notion.site/Online-Resume-23181626b6d781159faaeb5eadaf612e",
"stateOriginalUrl": "https://www.notion.so/marketplace/templates/online-resume"
}
}
================================================
FILE: tasks/notion/standard/online_resume/projects_section_update/verify.py
================================================
import sys
from notion_client import Client
from tasks.utils import notion_utils
def verify(notion: Client, main_id: str = None) -> bool:
"""
Verifies that the projects section has been reorganized correctly with cross-section references.
"""
page_id = None
if main_id:
found_id, object_type = notion_utils.find_page_or_database_by_id(
notion, main_id
)
if found_id and object_type == "page":
page_id = found_id
if not page_id:
page_id = notion_utils.find_page(notion, "Online Resume")
if not page_id:
print("Error: Page 'Online Resume' not found.", file=sys.stderr)
return False
# Find the Projects database
projects_db_id = notion_utils.find_database_in_block(notion, page_id, "Projects")
if not projects_db_id:
print("Error: Database 'Projects' not found.", file=sys.stderr)
return False
# Find the Skills database to get the highest skill level
skills_db_id = notion_utils.find_database_in_block(notion, page_id, "Skills")
if not skills_db_id:
print("Error: Database 'Skills' not found.", file=sys.stderr)
return False
# Query Skills database to find the highest skill level
skills_results = notion.databases.query(database_id=skills_db_id).get("results", [])
highest_skill_name = ""
highest_skill_level = 0
for skill_page in skills_results:
properties = skill_page.get("properties", {})
skill_name_prop = properties.get("Skill", {}).get("title", [])
skill_level_prop = properties.get("Skill Level", {}).get("number")
if skill_name_prop and skill_level_prop is not None:
skill_name = skill_name_prop[0].get("text", {}).get("content", "")
if skill_level_prop > highest_skill_level:
highest_skill_level = skill_level_prop
highest_skill_name = skill_name
if not highest_skill_name:
print("Error: Could not find any skills with skill levels.", file=sys.stderr)
return False
# Query Projects database
projects_results = notion.databases.query(database_id=projects_db_id).get(
"results", []
)
# Check that "Knitties eComm Website" is deleted
for page in projects_results:
properties = page.get("properties", {})
name_prop = properties.get("Name", {}).get("title", [])
if (
name_prop
and name_prop[0].get("text", {}).get("content") == "Knitties eComm Website"
):
print(
"Failure: 'Knitties eComm Website' project was not deleted.",
file=sys.stderr,
)
return False
# Check that "Zapier Dashboard Redesign" exists with correct properties
zapier_project_found = False
for page in projects_results:
properties = page.get("properties", {})
name_prop = properties.get("Name", {}).get("title", [])
if (
name_prop
and name_prop[0].get("text", {}).get("content")
== "Zapier Dashboard Redesign"
):
zapier_project_found = True
# Check description contains reference to UI Design Internship
desc_prop = properties.get("Description", {}).get("rich_text", [])
if not desc_prop:
print("Failure: Zapier project has no description.", file=sys.stderr)
return False
description_text = desc_prop[0].get("text", {}).get("content", "")
base_desc = "Led the complete redesign of Zapier's main dashboard, focusing on improved usability and modern design patterns. Implemented new navigation system and responsive layouts."
if base_desc not in description_text:
print(
"Failure: Zapier project description is missing base content.",
file=sys.stderr,
)
return False
# Check date
date_prop = properties.get("Date", {}).get("date", {})
if (
not date_prop
or date_prop.get("start") != "2024-01-01"
or date_prop.get("end") != "2024-06-30"
):
print(
"Failure: Zapier project date range is incorrect.", file=sys.stderr
)
return False
# Check tags
tags_prop = properties.get("Tags", {}).get("multi_select", [])
tag_names = {tag.get("name") for tag in tags_prop}
if "UI Design" not in tag_names or "Enterprise" not in tag_names:
print(
"Failure: Zapier project is missing required tags.", file=sys.stderr
)
return False
# Check phone
phone_prop = properties.get("Phone", {}).get("phone_number", [])
if not phone_prop or phone_prop != "+44 7871263013":
print(
"Failure: Zapier project phone number is incorrect.",
file=sys.stderr,
)
return
# Check url
url_prop = properties.get("Url", {}).get("url", [])
if not url_prop or url_prop != "www.zinenwine.com":
print("Failure: Zapier project url is incorrect.", file=sys.stderr)
return
# Check Enterprise tag color
enterprise_tag_purple = False
for tag in tags_prop:
if tag.get("name") == "Enterprise" and tag.get("color") == "purple":
enterprise_tag_purple = True
break
if not enterprise_tag_purple:
print(
"Failure: Enterprise tag does not have purple color.",
file=sys.stderr,
)
return False
break
if not zapier_project_found:
print(
"Failure: 'Zapier Dashboard Redesign' project not found.", file=sys.stderr
)
return False
# Find the Projects database block and verify blocks after it
all_blocks = notion_utils.get_all_blocks_recursively(notion, page_id)
# Find the Projects database block
projects_db_index = -1
for i, block in enumerate(all_blocks):
if (
block.get("type") == "child_database"
and block.get("child_database", {}).get("title") == "Projects"
):
projects_db_index = i
break
if projects_db_index == -1:
print("Error: Could not find Projects database block.", file=sys.stderr)
return False
# Check blocks after Projects database
    if projects_db_index + 3 >= len(all_blocks):
print("Failure: Not enough blocks after Projects database.", file=sys.stderr)
return False
# Check divider block
divider_block = all_blocks[projects_db_index + 1]
if divider_block.get("type") != "divider":
print(
"Failure: Expected divider block after Projects database.", file=sys.stderr
)
return False
# Check heading block
heading_block = all_blocks[projects_db_index + 2]
if heading_block.get("type") != "heading_2":
print("Failure: Expected heading_2 block after divider.", file=sys.stderr)
return False
heading_text = heading_block.get("heading_2", {}).get("rich_text", [])
if (
not heading_text
or heading_text[0].get("text", {}).get("content") != "Current Focus"
):
print("Failure: Heading text is incorrect.", file=sys.stderr)
return False
# Check paragraph block with dynamic skill reference
paragraph_block = all_blocks[projects_db_index + 3]
if paragraph_block.get("type") != "paragraph":
print("Failure: Expected paragraph block after heading.", file=sys.stderr)
return False
paragraph_text = paragraph_block.get("paragraph", {}).get("rich_text", [])
if not paragraph_text:
print("Failure: Paragraph block is empty.", file=sys.stderr)
return False
paragraph_content = paragraph_text[0].get("text", {}).get("content", "")
# Check that paragraph contains the base text
base_text = "The Zapier Dashboard Redesign represents my most impactful recent work, leveraging my expertise in"
if base_text not in paragraph_content:
print("Failure: Paragraph does not contain base text.", file=sys.stderr)
return False
# Check that paragraph references the highest skill
skill_level_percent = int(highest_skill_level * 100)
expected_skill_ref = f"{highest_skill_name} ({skill_level_percent}%)"
if expected_skill_ref not in paragraph_content:
print(
f"Failure: Paragraph does not reference highest skill '{expected_skill_ref}'.",
file=sys.stderr,
)
return False
# Check that paragraph contains the ending text
ending_text = (
"enterprise-grade solutions that prioritize both aesthetics and functionality"
)
if ending_text not in paragraph_content:
print(
"Failure: Paragraph does not contain proper ending text.", file=sys.stderr
)
return False
print(
f"Success: Projects section has been reorganized correctly with cross-section references (highest skill: {highest_skill_name} at {skill_level_percent}%)."
)
return True
def main():
"""
Executes the verification process and exits with a status code.
"""
notion = notion_utils.get_notion_client()
main_id = sys.argv[1] if len(sys.argv) > 1 else None
if verify(notion, main_id):
sys.exit(0)
else:
sys.exit(1)
if __name__ == "__main__":
main()
================================================
FILE: tasks/notion/standard/online_resume/skills_development_tracker/description.md
================================================
Create a comprehensive skills audit system by performing the following tasks:
**Task Requirements:**
1. Create a new database named "Skills Development Tracker" as a child database in the main resume page with the following properties:
- Name (title property)
- Current Skill (relation to Skills database)
- Current Proficiency (rollup from related skill's "Skill Level" property)
- Target Proficiency (number property with format "percent")
- Gap (formula: Target Proficiency - Current Proficiency)
- Learning Resources (rich text property)
- Progress Notes (rich text property)
2. Populate the Skills Development Tracker database with entries for all skills that have a proficiency level below 70% (0.7):
- For each qualifying skill, create an entry with:
- Name: "[Skill Name] Development Plan"
- Link to the corresponding skill in Skills database
- Target Proficiency: Set to Current + 25% (capped at 95%)
- Learning Resources: "Online courses and practice projects"
- Progress Notes: "Initial assessment completed"
3. Create a callout block immediately after the Skills section (after the Skills database) with:
- Background color: blue_background
- Icon: 🎯 (target emoji)
- Content: "Focus Areas: [3 skills with lowest current proficiency]"
================================================
FILE: tasks/notion/standard/online_resume/skills_development_tracker/meta.json
================================================
{
"task_id": "skills_development_tracker",
"task_name": "Skills Development Tracker",
"category_id": "online_resume",
"category_name": "Online Resume",
"description": "Create a comprehensive skills audit system with development tracking for skills below 70% proficiency.",
"author": "Xiangyan Liu",
"created_at": "2025-07-27",
"difficulty": "L3",
"tags": [
"database manipulation",
"cross-reference linking",
"conditional filtering",
"data aggregation",
"template population",
"visual formatting"
],
"mcp": [
"notion"
],
"meta_data": {
"stateType": "url",
"stateContent": null,
"stateUrl": "https://painted-tennis-ebc.notion.site/Online-Resume-23181626b6d781159faaeb5eadaf612e",
"stateOriginalUrl": "https://www.notion.so/marketplace/templates/online-resume"
}
}
================================================
FILE: tasks/notion/standard/online_resume/skills_development_tracker/verify.py
================================================
import sys
from notion_client import Client
from tasks.utils import notion_utils
def verify(notion: Client, main_id: str = None) -> bool:
"""
Verifies that the Skills Development Tracker database and callout block were created correctly.
"""
page_id = None
if main_id:
found_id, object_type = notion_utils.find_page_or_database_by_id(
notion, main_id
)
if found_id and object_type == "page":
page_id = found_id
if not page_id:
page_id = notion_utils.find_page(notion, "New Online Resume")
if not page_id:
print("Error: Page 'New Online Resume' not found.", file=sys.stderr)
return False
# Step 1: Verify Skills Development Tracker database exists
tracker_db_id = notion_utils.find_database_in_block(
notion, page_id, "Skills Development Tracker"
)
if not tracker_db_id:
print(
"Error: Database 'Skills Development Tracker' not found.", file=sys.stderr
)
return False
# Step 2: Verify database schema
try:
db_info = notion.databases.retrieve(database_id=tracker_db_id)
properties = db_info.get("properties", {})
# Check required properties
required_props = {
"Name": "title",
"Current Skill": "relation",
"Current Proficiency": "rollup",
"Target Proficiency": "number",
"Gap": "formula",
"Learning Resources": "rich_text",
"Progress Notes": "rich_text",
}
for prop_name, expected_type in required_props.items():
if prop_name not in properties:
print(
f"Error: Property '{prop_name}' not found in database.",
file=sys.stderr,
)
return False
if properties[prop_name]["type"] != expected_type:
print(
f"Error: Property '{prop_name}' has incorrect type. Expected '{expected_type}', got '{properties[prop_name]['type']}'.",
file=sys.stderr,
)
return False
# Verify Target Proficiency is percent format
if (
properties["Target Proficiency"].get("number", {}).get("format")
!= "percent"
):
print(
"Error: Target Proficiency should have 'percent' format.",
file=sys.stderr,
)
return False
except Exception as e:
print(f"Error retrieving database info: {e}", file=sys.stderr)
return False
# Step 3: Get Skills database to check entries
skills_db_id = notion_utils.find_database_in_block(notion, page_id, "Skills")
if not skills_db_id:
print("Error: Skills database not found.", file=sys.stderr)
return False
# Get all skills with proficiency < 70%
skills_below_70 = []
try:
skills_results = notion.databases.query(database_id=skills_db_id).get(
"results", []
)
for skill in skills_results:
skill_level = (
skill.get("properties", {}).get("Skill Level", {}).get("number", 1.0)
)
if skill_level < 0.7:
skill_name = (
skill.get("properties", {}).get("Skill", {}).get("title", [])
)
if skill_name:
skill_name_text = skill_name[0].get("text", {}).get("content", "")
skills_below_70.append(
{
"name": skill_name_text,
"id": skill["id"],
"level": skill_level,
}
)
except Exception as e:
print(f"Error querying Skills database: {e}", file=sys.stderr)
return False
if not skills_below_70:
print("Warning: No skills found with proficiency below 70%.", file=sys.stderr)
# This might be OK if all skills are above 70%
# Step 4: Verify entries in Skills Development Tracker
try:
tracker_results = notion.databases.query(database_id=tracker_db_id).get(
"results", []
)
# Check that we have entries for skills below 70%
if len(skills_below_70) > 0 and len(tracker_results) == 0:
print(
"Error: No entries found in Skills Development Tracker database.",
file=sys.stderr,
)
return False
# Verify each entry
for entry in tracker_results:
props = entry.get("properties", {})
# Check name format
name_prop = props.get("Name", {}).get("title", [])
if not name_prop:
print("Error: Entry missing Name property.", file=sys.stderr)
return False
name_text = name_prop[0].get("text", {}).get("content", "")
if not name_text.endswith(" Development Plan"):
print(
f"Error: Entry name '{name_text}' doesn't follow expected format.",
file=sys.stderr,
)
return False
# Check relation to Skills database
skill_relation = props.get("Current Skill", {}).get("relation", [])
if not skill_relation:
print(
f"Error: Entry '{name_text}' missing Current Skill relation.",
file=sys.stderr,
)
return False
# Check Target Proficiency (should be set)
target_prof = props.get("Target Proficiency", {}).get("number")
if target_prof is None:
print(
f"Error: Entry '{name_text}' missing Target Proficiency.",
file=sys.stderr,
)
return False
# Check Learning Resources
learning_resources = props.get("Learning Resources", {}).get(
"rich_text", []
)
if not learning_resources:
print(
f"Error: Entry '{name_text}' missing Learning Resources.",
file=sys.stderr,
)
return False
# Check Progress Notes
progress_notes = props.get("Progress Notes", {}).get("rich_text", [])
if not progress_notes:
print(
f"Error: Entry '{name_text}' missing Progress Notes.",
file=sys.stderr,
)
return False
except Exception as e:
print(f"Error querying Skills Development Tracker: {e}", file=sys.stderr)
return False
# Step 5: Verify callout block exists after Skills section
all_blocks = notion_utils.get_all_blocks_recursively(notion, page_id)
# Find Skills database block
skills_db_block_index = None
for i, block in enumerate(all_blocks):
if (
block.get("type") == "child_database"
and block.get("child_database", {}).get("title") == "Skills"
):
skills_db_block_index = i
break
if skills_db_block_index is None:
print("Error: Could not find Skills database block.", file=sys.stderr)
return False
# Look for callout block after Skills database
callout_found = False
    # Guard against the Skills database being the last block on the page.
    if skills_db_block_index + 1 >= len(all_blocks):
        print("Error: No block found after the Skills database.", file=sys.stderr)
        return False
    block = all_blocks[skills_db_block_index + 1]
if block.get("type") == "callout":
callout_data = block.get("callout", {})
# Check background color
if callout_data.get("color") != "blue_background":
print("Error: Could not find callout block with blue background.")
return False
# Check icon
icon = callout_data.get("icon", {})
if icon.get("type") != "emoji" or icon.get("emoji") != "🎯":
print("Error: Could not find callout block with 🎯 emoji.")
return False
# Check content starts with "Focus Areas:"
rich_text = callout_data.get("rich_text", [])
if rich_text:
content = rich_text[0].get("text", {}).get("content", "")
if (
content.startswith("Focus Areas:")
and "CSS + Basic JS" in content
and "Webflow" in content
and "Rive" in content
):
callout_found = True
print(f"Success: Found callout block with content: {content}")
else:
print("Error: Could not find callout block with required text content.")
return False
if not callout_found:
print(
"Error: Could not find callout block with Focus Areas after Skills section.",
file=sys.stderr,
)
return False
print(
"Success: Skills Development Tracker database and callout block verified successfully."
)
return True
def main():
"""
Executes the verification process and exits with a status code.
"""
notion = notion_utils.get_notion_client()
main_id = sys.argv[1] if len(sys.argv) > 1 else None
if verify(notion, main_id):
sys.exit(0)
else:
sys.exit(1)
if __name__ == "__main__":
main()
================================================
FILE: tasks/notion/standard/online_resume/work_history_addition/description.md
================================================
Hi! I realized I forgot to include one work experience on my resume page titled "Online Resume." Could you please help me add it to the "Work History" section?
The position is "Research Assistant," and it took place from January to August 2023. The description should be: "Assisted in conducting user experience research projects at my bachelor’s program, supporting data collection, analyzing user feedback, and preparing research reports. Developed strong skills in research methodologies and improved collaboration with interdisciplinary teams."
For the image or logo, please use the one from the "Education" section (my bachelor’s school) to keep everything consistent.
Also, please make sure that the formatting — including font style, size, and layout — matches the existing entries in the Work History section so it looks seamless.
Thank you!
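For reference, here is a minimal sketch of the kind of API call this edit maps to, using the `notion_client` SDK already used by this repo's verify scripts. The page ID and logo URL are placeholders, the two-column layout with a bold title, an italic gray date, and a plain description mirrors the existing entries, and the sketch uses an external image URL for brevity (the actual task reuses the Notion-hosted image from the Education section).

```python
from tasks.utils import notion_utils

notion = notion_utils.get_notion_client()

PAGE_ID = "<online-resume-page-id>"                    # placeholder
EDUCATION_LOGO_URL = "<education-section-image-url>"   # placeholder

DESCRIPTION = (
    "Assisted in conducting user experience research projects at my bachelor’s "
    "program, supporting data collection, analyzing user feedback, and preparing "
    "research reports. Developed strong skills in research methodologies and "
    "improved collaboration with interdisciplinary teams."
)

def text(content, **annotations):
    """Build a single rich_text run with optional annotations (bold, italic, color)."""
    run = {"type": "text", "text": {"content": content}}
    if annotations:
        run["annotations"] = annotations
    return run

new_entry = {
    "object": "block",
    "type": "column_list",
    "column_list": {
        "children": [
            {   # Narrow column holding the logo reused from the Education section.
                "object": "block",
                "type": "column",
                "column": {"children": [
                    {"object": "block", "type": "image",
                     "image": {"type": "external", "external": {"url": EDUCATION_LOGO_URL}}},
                ]},
            },
            {   # Wide column holding the title, dates, and description.
                "object": "block",
                "type": "column",
                "column": {"children": [
                    {"object": "block", "type": "paragraph",
                     "paragraph": {"rich_text": [text("Research Assistant", bold=True)]}},
                    {"object": "block", "type": "paragraph",
                     "paragraph": {"rich_text": [text("January - August 2023", italic=True, color="gray")]}},
                    {"object": "block", "type": "paragraph",
                     "paragraph": {"rich_text": [text(DESCRIPTION)]}},
                ]},
            },
        ]
    },
}

# Appending to the page places the entry at the bottom; a full solution would
# position it under the Work History heading next to the existing entries.
notion.blocks.children.append(block_id=PAGE_ID, children=[new_entry])
```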
================================================
FILE: tasks/notion/standard/online_resume/work_history_addition/meta.json
================================================
{
"task_id": "work_history_addition",
"task_name": "Work History Addition",
"category_id": "online_resume",
"category_name": "Online Resume",
"description": "Add a Research Assistant position to the Work History section with consistent formatting and university logo.",
"author": "Xiangyan Liu",
"created_at": "2025-07-27",
"difficulty": "L3",
"tags": [
"database manipulation",
"template population",
"cross-reference linking",
"visual formatting"
],
"mcp": [
"notion"
],
"meta_data": {
"stateType": "url",
"stateContent": null,
"stateUrl": "https://painted-tennis-ebc.notion.site/Online-Resume-23181626b6d781159faaeb5eadaf612e",
"stateOriginalUrl": "https://www.notion.so/marketplace/templates/online-resume"
}
}
================================================
FILE: tasks/notion/standard/online_resume/work_history_addition/verify.py
================================================
import sys
from notion_client import Client
from tasks.utils import notion_utils
def verify(notion: Client, main_id: str = None) -> bool:
"""
Verifies that the new work history entry for 'Research Assistant' has been added correctly.
"""
page_id = None
if main_id:
found_id, object_type = notion_utils.find_page_or_database_by_id(
notion, main_id
)
if found_id and object_type == "page":
page_id = found_id
if not page_id:
page_id = notion_utils.find_page(notion, "Online Resume")
if not page_id:
print("Error: Page 'Online Resume' not found.", file=sys.stderr)
return False
all_blocks = notion_utils.get_all_blocks_recursively(notion, page_id)
def find_image_url_under_heading(blocks, heading_text, notion_client):
heading_index = -1
for i, block in enumerate(blocks):
block_type = block.get("type")
if block_type == "heading_1":
if heading_text in notion_utils.get_block_plain_text(block):
heading_index = i
break
if heading_index == -1:
return None
for i in range(heading_index + 1, len(blocks)):
block = blocks[i]
if block.get("type") in ["heading_1", "heading_2", "heading_3"]:
break
if block.get("type") == "image" and block.get("image", {}).get("file"):
return block.get("image", {}).get("file", {}).get("url")
if block.get("type") == "column_list":
column_list_id = block["id"]
columns = notion_utils.get_all_blocks_recursively(
notion_client, column_list_id
)
for column in columns:
if column.get("type") == "column":
column_id = column["id"]
column_blocks = notion_utils.get_all_blocks_recursively(
notion_client, column_id
)
for inner_block in column_blocks:
if inner_block.get("type") == "image" and inner_block.get(
"image", {}
).get("file"):
return (
inner_block.get("image", {})
.get("file", {})
.get("url")
)
return None
def get_block_annotations(block):
block_type = block.get("type")
if not block_type:
return {}
block_content = block.get(block_type)
if not block_content:
return {}
rich_text_list = block_content.get("rich_text", [])
if not rich_text_list:
return {}
return rich_text_list[0].get("annotations", {})
education_image_url = find_image_url_under_heading(all_blocks, "Education", notion)
if not education_image_url:
print(
"Error: Could not find the image in the 'Education' section.",
file=sys.stderr,
)
return False
heading_text = "Work History"
heading_index = -1
for i, block in enumerate(all_blocks):
if block.get(
"type"
) == "heading_1" and heading_text in notion_utils.get_block_plain_text(block):
heading_index = i
break
if heading_index == -1:
print(f"Error: Could not find the '{heading_text}' heading.", file=sys.stderr)
return False
for i in range(heading_index + 1, len(all_blocks)):
block = all_blocks[i]
if block.get("type") in ["heading_1", "heading_2", "heading_3"]:
break
if block.get("type") == "column_list":
column_list_id = block["id"]
columns = notion_utils.get_all_blocks_recursively(notion, column_list_id)
if len(columns) < 2:
continue
            # Identify the image and text columns by width ratio; skip this
            # column_list if either one is missing.
            image_column = None
            text_column = None
            for column in columns:
                if column.get("type") == "column":
                    if column.get("column", {}).get("width_ratio") == 0.125:
                        image_column = column
                    elif column.get("column", {}).get("width_ratio") == 0.875:
                        text_column = column
            if image_column is None or text_column is None:
                continue
            image_column_blocks = notion_utils.get_all_blocks_recursively(
                notion, image_column["id"]
            )
            text_column_blocks = notion_utils.get_all_blocks_recursively(
                notion, text_column["id"]
            )
column_image_url = None
for inner_block in image_column_blocks:
if inner_block.get("type") == "image" and inner_block.get(
"image", {}
).get("file"):
column_image_url = (
inner_block.get("image", {}).get("file", {}).get("url")
)
break
if (
not column_image_url
or column_image_url[:100] != education_image_url[:100]
):
continue
for j, inner_block in enumerate(text_column_blocks):
if "Research Assistant" in notion_utils.get_block_plain_text(
inner_block
):
title_annotations = get_block_annotations(inner_block)
if j + 2 < len(text_column_blocks):
date_block = text_column_blocks[j + 1]
description_block = text_column_blocks[j + 2]
date_text = "January - August 2023"
description_text = "Assisted in conducting user experience research projects at my bachelor’s program, supporting data collection, analyzing user feedback, and preparing research reports. Developed strong skills in research methodologies and improved collaboration with interdisciplinary teams."
date_annotations = get_block_annotations(date_block)
description_annotations = get_block_annotations(
description_block
)
if (
date_text in notion_utils.get_block_plain_text(date_block)
and description_text
in notion_utils.get_block_plain_text(description_block)
and title_annotations.get("bold")
and date_annotations.get("italic")
and date_annotations.get("color") == "gray"
and description_annotations.get("color") == "default"
and description_annotations.get("italic") != True
and description_annotations.get("bold") != True
):
print("Success: Verified new work history entry.")
return True
print("Failure: Could not verify the new work history entry.", file=sys.stderr)
return False
def main():
"""
Executes the verification process and exits with a status code.
"""
notion = notion_utils.get_notion_client()
main_id = sys.argv[1] if len(sys.argv) > 1 else None
if verify(notion, main_id):
sys.exit(0)
else:
sys.exit(1)
if __name__ == "__main__":
main()
================================================
FILE: tasks/notion/standard/python_roadmap/expert_level_lessons/description.md
================================================
# Task: Expert Level Learning Path with Complex Prerequisites
## Objective
Create an Expert Level chapter in the Python Roadmap with sophisticated prerequisite chains that require deep understanding of the existing course structure.
## Requirements
### 1. Create Expert Level Chapter
- **Database**: Chapters database
- **Properties**:
- Name: `Expert Level`
- Icon: 🟣 (purple circle emoji)
- Must appear after Advanced Level in the database
### 2. Create Bridge Lesson
Create a lesson that bridges advanced and expert content:
- **Title**: `Advanced Foundations Review`
- **Status**: Done
- **Chapter**: Link to Expert Level
- **Parent item**: Link to the lesson that currently has status "In Progress" and contains "Control" in its title
- **Sub-items**: Must link to exactly these three lessons:
- The lesson with title containing "Decorators"
- The lesson with title containing "Calling API"
- The lesson with title containing "Regular Expressions"
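A minimal sketch of the two creation calls these first two requirements map to, assuming placeholder IDs that a real solution would resolve by querying the Chapters and Steps databases first, and the property names used elsewhere on this board (`Name`, `Lessons`, `Status`, `Chapters`, `Parent item`, `Sub-item`):

```python
from tasks.utils import notion_utils

notion = notion_utils.get_notion_client()

# Placeholder IDs -- resolve these by locating the Chapters/Steps databases and
# the referenced lessons (Control Flow, Decorators, Calling API, Regular Expressions).
CHAPTERS_DB_ID = "<chapters-database-id>"
STEPS_DB_ID = "<steps-database-id>"
CONTROL_FLOW_ID = "<control-flow-lesson-id>"
PREREQ_IDS = ["<decorators-id>", "<calling-api-id>", "<regular-expressions-id>"]

# Requirement 1: the Expert Level chapter with a purple circle icon.
expert_chapter = notion.pages.create(
    parent={"database_id": CHAPTERS_DB_ID},
    icon={"type": "emoji", "emoji": "🟣"},
    properties={"Name": {"title": [{"text": {"content": "Expert Level"}}]}},
)

# Requirement 2: the bridge lesson linking advanced and expert content.
notion.pages.create(
    parent={"database_id": STEPS_DB_ID},
    properties={
        "Lessons": {"title": [{"text": {"content": "Advanced Foundations Review"}}]},
        "Status": {"status": {"name": "Done"}},
        "Chapters": {"relation": [{"id": expert_chapter["id"]}]},
        "Parent item": {"relation": [{"id": CONTROL_FLOW_ID}]},
        "Sub-item": {"relation": [{"id": pid} for pid in PREREQ_IDS]},
    },
)
```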
### 3. Create Expert Level Lessons
Add exactly 4 expert lessons to the Steps database:
**Lesson 1**: `Metaprogramming and AST Manipulation`
- Status: To Do
- Chapter: Expert Level
- Parent item: Link to "Advanced Foundations Review"
- Date: 2025-09-15
**Lesson 2**: `Async Concurrency Patterns`
- Status: To Do
- Chapter: Expert Level
- Parent item: Link to the lesson titled "Calling API"
- Date: 2025-09-20
**Lesson 3**: `Memory Management and GC Tuning`
- Status: In Progress
- Chapter: Expert Level
- Parent item: Link to "Advanced Foundations Review"
- Sub-item: Must have exactly 2 links:
- Link to any lesson from "Data Structures" that has status "To Do"
- Link to the lesson containing "OOP" in its title
- Date: 2025-09-25
**Lesson 4**: `Building Python C Extensions`
- Status: To Do
- Chapter: Expert Level
- Parent item: Link to "Metaprogramming and AST Manipulation"
- Date: 2025-10-01
### 4. Update Existing Lessons
- Change the status of "Decorators" from "To Do" to "Done"
- Add "Async Concurrency Patterns" as a Sub-item to "Error Handling"
- Update "Control Flow" status from "In Progress" to "Done"
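These updates correspond to `pages.update` calls; a rough sketch with placeholder lesson IDs (relation properties are replaced wholesale on update, so the existing Sub-item list is read first):

```python
from tasks.utils import notion_utils

notion = notion_utils.get_notion_client()

DECORATORS_ID = "<decorators-lesson-id>"                        # placeholder
CONTROL_FLOW_ID = "<control-flow-lesson-id>"                    # placeholder
ERROR_HANDLING_ID = "<error-handling-lesson-id>"                # placeholder
ASYNC_PATTERNS_ID = "<async-concurrency-patterns-lesson-id>"    # placeholder

# Flip the two statuses to Done.
for lesson_id in (DECORATORS_ID, CONTROL_FLOW_ID):
    notion.pages.update(
        page_id=lesson_id,
        properties={"Status": {"status": {"name": "Done"}}},
    )

# Add the new expert lesson as a Sub-item of Error Handling, keeping any
# relations that are already there.
existing = notion.pages.retrieve(page_id=ERROR_HANDLING_ID)["properties"]["Sub-item"]["relation"]
notion.pages.update(
    page_id=ERROR_HANDLING_ID,
    properties={"Sub-item": {"relation": existing + [{"id": ASYNC_PATTERNS_ID}]}},
)
```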
### 5. Create Learning Path Notes
Add content to the "Advanced Foundations Review" lesson page:
- **Block 1**: Heading 2 with text `Prerequisites Checklist`
- **Block 2**: Bulleted list with exactly 3 items:
- `✅ Advanced Python Features (Decorators, Context Managers)`
- `✅ API Integration and Async Basics`
- `✅ Pattern Matching and Text Processing`
- **Block 3**: Paragraph with text: `This lesson serves as a checkpoint before entering expert-level content. Ensure you have mastered all prerequisites listed above.`
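This page content maps onto a single `blocks.children.append` call; a minimal sketch, assuming a placeholder ID for the Advanced Foundations Review page:

```python
from tasks.utils import notion_utils

notion = notion_utils.get_notion_client()

BRIDGE_LESSON_ID = "<advanced-foundations-review-page-id>"  # placeholder

def rt(content):
    """Plain rich_text payload for a single text run."""
    return [{"type": "text", "text": {"content": content}}]

notion.blocks.children.append(
    block_id=BRIDGE_LESSON_ID,
    children=[
        {"object": "block", "type": "heading_2",
         "heading_2": {"rich_text": rt("Prerequisites Checklist")}},
        {"object": "block", "type": "bulleted_list_item",
         "bulleted_list_item": {"rich_text": rt("✅ Advanced Python Features (Decorators, Context Managers)")}},
        {"object": "block", "type": "bulleted_list_item",
         "bulleted_list_item": {"rich_text": rt("✅ API Integration and Async Basics")}},
        {"object": "block", "type": "bulleted_list_item",
         "bulleted_list_item": {"rich_text": rt("✅ Pattern Matching and Text Processing")}},
        {"object": "block", "type": "paragraph",
         "paragraph": {"rich_text": rt(
             "This lesson serves as a checkpoint before entering expert-level "
             "content. Ensure you have mastered all prerequisites listed above."
         )}},
    ],
)
```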
================================================
FILE: tasks/notion/standard/python_roadmap/expert_level_lessons/meta.json
================================================
{
"task_id": "expert_level_lessons",
"task_name": "Expert Level Lessons",
"category_id": "python_roadmap",
"category_name": "Python Roadmap",
"description": "Create an Expert Level chapter with sophisticated prerequisite chains and four expert-level lessons.",
"author": "Lingjun Chen",
"created_at": "2025-08-02",
"difficulty": "L3",
"tags": [
"database manipulation",
"cross-reference linking",
"conditional filtering",
"status tracking",
"template population"
],
"mcp": [
"notion"
],
"meta_data": {
"stateType": "url",
"stateContent": null,
"stateUrl": "https://painted-tennis-ebc.notion.site/Python-Roadmap-25281626b6d78012bf2bce1fa8711f4d",
"stateOriginalUrl": "https://www.notion.so/marketplace/templates/python-roadmap"
}
}
================================================
FILE: tasks/notion/standard/python_roadmap/expert_level_lessons/verify.py
================================================
import sys
from notion_client import Client
from tasks.utils import notion_utils
def verify(notion: Client, main_id: str = None) -> bool:
"""
Verifies that the Expert Level chapter and its lessons have been created correctly with complex prerequisites.
"""
# Step 1: Find the main page and get database IDs
if main_id:
found_id, object_type = notion_utils.find_page_or_database_by_id(notion, main_id)
if not found_id or object_type != 'page':
print("Error: Main page not found.", file=sys.stderr)
return False
else:
# Try to find the main page by searching
found_id = notion_utils.find_page(notion, "Python Roadmap")
if not found_id:
print("Error: Main page not found.", file=sys.stderr)
return False
print(f"Found main page: {found_id}")
# Get all blocks from the page to find database references
all_blocks = notion_utils.get_all_blocks_recursively(notion, found_id)
print(f"Found {len(all_blocks)} blocks")
# Find database IDs from the page
chapters_db_id = None
steps_db_id = None
for block in all_blocks:
if block and block.get("type") == "child_database":
db_title = block.get("child_database", {}).get("title", "")
if "Chapters" in db_title:
chapters_db_id = block["id"]
print(f"Found Chapters database: {chapters_db_id}")
elif "Steps" in db_title:
steps_db_id = block["id"]
print(f"Found Steps database: {steps_db_id}")
if not chapters_db_id:
print("Error: Chapters database not found.", file=sys.stderr)
return False
if not steps_db_id:
print("Error: Steps database not found.", file=sys.stderr)
return False
print("Starting verification...")
# Step 2: Verify the Expert Level chapter exists
print("2. Checking for Expert Level chapter...")
expert_chapter_id = None
try:
chapters_response = notion.databases.query(
database_id=chapters_db_id,
filter={
"property": "Name",
"title": {
"equals": "Expert Level"
}
}
)
if not chapters_response.get("results"):
print(f"Error: Expert Level chapter not found in Chapters database.", file=sys.stderr)
return False
expert_chapter = chapters_response["results"][0]
expert_chapter_id = expert_chapter["id"]
# Check chapter icon (purple circle)
chapter_icon = expert_chapter.get("icon")
if not chapter_icon or chapter_icon.get("type") != "emoji" or chapter_icon.get("emoji") != "🟣":
print(f"Error: Expert Level chapter does not have the correct purple circle emoji icon.", file=sys.stderr)
return False
print(f"✓ Expert Level chapter found with correct icon: 🟣")
except Exception as e:
print(f"Error querying Chapters database: {e}", file=sys.stderr)
return False
# Step 3: Find Control Flow lesson (In Progress status)
print("3. Finding Control Flow lesson...")
control_flow_id = None
try:
control_flow_response = notion.databases.query(
database_id=steps_db_id,
filter={
"and": [
{
"property": "Lessons",
"title": {
"contains": "Control"
}
},
{
"property": "Status",
"status": {
"equals": "Done" # Should be updated to Done
}
}
]
}
)
if control_flow_response.get("results"):
control_flow_lesson = control_flow_response["results"][0]
control_flow_id = control_flow_lesson["id"]
print(f"✓ Found Control Flow lesson with status 'Done'")
else:
print(f"Error: Control Flow lesson not found with status 'Done'.", file=sys.stderr)
return False
except Exception as e:
print(f"Error finding Control Flow lesson: {e}", file=sys.stderr)
return False
# Step 4: Find prerequisite lessons
print("4. Finding prerequisite lessons...")
decorators_id = None
calling_api_id = None
regex_id = None
try:
# Find Decorators (should be Done)
decorators_response = notion.databases.query(
database_id=steps_db_id,
filter={
"property": "Lessons",
"title": {
"contains": "Decorators"
}
}
)
if decorators_response.get("results"):
decorators_lesson = decorators_response["results"][0]
decorators_id = decorators_lesson["id"]
# Check status is Done
if decorators_lesson["properties"]["Status"]["status"]["name"] != "Done":
print(f"Error: Decorators lesson should have status 'Done'.", file=sys.stderr)
return False
print(f"✓ Found Decorators lesson with status 'Done'")
else:
print(f"Error: Decorators lesson not found.", file=sys.stderr)
return False
# Find Calling API
calling_api_response = notion.databases.query(
database_id=steps_db_id,
filter={
"property": "Lessons",
"title": {
"equals": "Calling API"
}
}
)
if calling_api_response.get("results"):
calling_api_lesson = calling_api_response["results"][0]
calling_api_id = calling_api_lesson["id"]
print(f"✓ Found Calling API lesson")
else:
print(f"Error: Calling API lesson not found.", file=sys.stderr)
return False
# Find Regular Expressions
regex_response = notion.databases.query(
database_id=steps_db_id,
filter={
"property": "Lessons",
"title": {
"contains": "Regular Expressions"
}
}
)
if regex_response.get("results"):
regex_lesson = regex_response["results"][0]
regex_id = regex_lesson["id"]
print(f"✓ Found Regular Expressions lesson")
else:
print(f"Error: Regular Expressions lesson not found.", file=sys.stderr)
return False
except Exception as e:
print(f"Error finding prerequisite lessons: {e}", file=sys.stderr)
return False
# Step 5: Verify Advanced Foundations Review bridge lesson
print("5. Checking Advanced Foundations Review bridge lesson...")
bridge_id = None
try:
bridge_response = notion.databases.query(
database_id=steps_db_id,
filter={
"property": "Lessons",
"title": {
"equals": "Advanced Foundations Review"
}
}
)
if not bridge_response.get("results"):
print(f"Error: Advanced Foundations Review lesson not found.", file=sys.stderr)
return False
bridge_lesson = bridge_response["results"][0]
bridge_id = bridge_lesson["id"]
# Check status is Done
if bridge_lesson["properties"]["Status"]["status"]["name"] != "Done":
print(f"Error: Advanced Foundations Review should have status 'Done'.", file=sys.stderr)
return False
# Check linked to Expert Level chapter
bridge_chapters = bridge_lesson["properties"]["Chapters"]["relation"]
if not any(rel["id"] == expert_chapter_id for rel in bridge_chapters):
print(f"Error: Advanced Foundations Review not linked to Expert Level chapter.", file=sys.stderr)
return False
# Check Parent item is Control Flow
bridge_parent = bridge_lesson["properties"]["Parent item"]["relation"]
if not bridge_parent or bridge_parent[0]["id"] != control_flow_id:
print(f"Error: Advanced Foundations Review should have Control Flow as Parent item.", file=sys.stderr)
return False
# Check Sub-items (should have at least 3 specific lessons plus any that reference it as parent)
bridge_subitems = bridge_lesson["properties"]["Sub-item"]["relation"]
required_subitems = {decorators_id, calling_api_id, regex_id}
actual_subitems = {item["id"] for item in bridge_subitems}
if not required_subitems.issubset(actual_subitems):
print(f"Error: Advanced Foundations Review should have at least these 3 sub-items: Decorators, Calling API, Regular Expressions.", file=sys.stderr)
return False
# Due to bidirectional relations, lessons that have this as parent will also appear as sub-items
# We expect at least 5: 3 initial + 2 that reference it as parent (Metaprogramming and Memory Management)
if len(bridge_subitems) < 5:
print(f"Error: Advanced Foundations Review should have at least 5 sub-items (3 initial + 2 from parent relations), found {len(bridge_subitems)}.", file=sys.stderr)
return False
print(f"✓ Advanced Foundations Review has {len(bridge_subitems)} sub-items, including the 3 required ones")
print(f"✓ Advanced Foundations Review found with correct properties")
except Exception as e:
print(f"Error checking bridge lesson: {e}", file=sys.stderr)
return False
# Step 6: Verify the 4 expert lessons
print("6. Checking the 4 expert lessons...")
# Note: Async Concurrency Patterns will have Error Handling as parent (due to sub-item relation)
# We'll need to find Error Handling's ID first
error_handling_response = notion.databases.query(
database_id=steps_db_id,
filter={
"property": "Lessons",
"title": {
"equals": "Error Handling"
}
}
)
error_handling_id = None
if error_handling_response.get("results"):
error_handling_id = error_handling_response["results"][0]["id"]
else:
print(f"Error: Error Handling lesson not found.", file=sys.stderr)
return False
expert_lessons = {
"Metaprogramming and AST Manipulation": {
"status": "To Do",
"parent": bridge_id,
"date": "2025-09-15"
},
"Async Concurrency Patterns": {
"status": "To Do",
"parent": error_handling_id, # Parent is Error Handling due to sub-item relation
"date": "2025-09-20"
},
"Memory Management and GC Tuning": {
"status": "In Progress",
"parent": bridge_id,
"date": "2025-09-25"
},
"Building Python C Extensions": {
"status": "To Do",
"date": "2025-10-01"
}
}
lesson_ids = {}
try:
for lesson_name, expected in expert_lessons.items():
lesson_response = notion.databases.query(
database_id=steps_db_id,
filter={
"property": "Lessons",
"title": {
"equals": lesson_name
}
}
)
if not lesson_response.get("results"):
print(f"Error: Lesson '{lesson_name}' not found.", file=sys.stderr)
return False
lesson = lesson_response["results"][0]
lesson_ids[lesson_name] = lesson["id"]
# Check status
if lesson["properties"]["Status"]["status"]["name"] != expected["status"]:
print(f"Error: Lesson '{lesson_name}' should have status '{expected['status']}'.", file=sys.stderr)
return False
# Check linked to Expert Level chapter
lesson_chapters = lesson["properties"]["Chapters"]["relation"]
if not any(rel["id"] == expert_chapter_id for rel in lesson_chapters):
print(f"Error: Lesson '{lesson_name}' not linked to Expert Level chapter.", file=sys.stderr)
return False
# Check date
lesson_date = lesson["properties"]["Date"]["date"]
            # A missing date should fail too, since each lesson specifies a Date.
            if not lesson_date or lesson_date.get("start") != expected["date"]:
print(f"Error: Lesson '{lesson_name}' should have date '{expected['date']}'.", file=sys.stderr)
return False
# Check parent item for lessons that have specific parent requirements
if "parent" in expected:
lesson_parent = lesson["properties"]["Parent item"]["relation"]
if not lesson_parent or lesson_parent[0]["id"] != expected["parent"]:
print(f"Error: Lesson '{lesson_name}' should have correct parent item.", file=sys.stderr)
return False
print(f"✓ Lesson '{lesson_name}' found with correct properties")
# Special checks for Building Python C Extensions parent relationship
# (other parent checks are handled in the loop above)
building_lesson = notion.databases.query(
database_id=steps_db_id,
filter={
"property": "Lessons",
"title": {
"equals": "Building Python C Extensions"
}
}
)["results"][0]
building_parent = building_lesson["properties"]["Parent item"]["relation"]
if not building_parent or building_parent[0]["id"] != lesson_ids["Metaprogramming and AST Manipulation"]:
print(f"Error: Building Python C Extensions should have Metaprogramming and AST Manipulation as parent.", file=sys.stderr)
return False
# Memory Management should have 2 sub-items
memory_lesson = notion.databases.query(
database_id=steps_db_id,
filter={
"property": "Lessons",
"title": {
"equals": "Memory Management and GC Tuning"
}
}
)["results"][0]
memory_subitems = memory_lesson["properties"]["Sub-item"]["relation"]
if len(memory_subitems) != 2:
print(f"Error: Memory Management and GC Tuning should have exactly 2 sub-items.", file=sys.stderr)
return False
except Exception as e:
print(f"Error checking expert lessons: {e}", file=sys.stderr)
return False
# Step 7: Verify Error Handling has Async Concurrency Patterns as sub-item
print("7. Checking Error Handling sub-item...")
try:
error_handling_response = notion.databases.query(
database_id=steps_db_id,
filter={
"property": "Lessons",
"title": {
"equals": "Error Handling"
}
}
)
if error_handling_response.get("results"):
error_handling_lesson = error_handling_response["results"][0]
error_subitems = error_handling_lesson["properties"]["Sub-item"]["relation"]
if not any(item["id"] == lesson_ids["Async Concurrency Patterns"] for item in error_subitems):
print(f"Error: Error Handling should have Async Concurrency Patterns as sub-item.", file=sys.stderr)
return False
print(f"✓ Error Handling has Async Concurrency Patterns as sub-item")
else:
print(f"Error: Error Handling lesson not found.", file=sys.stderr)
return False
except Exception as e:
print(f"Error checking Error Handling: {e}", file=sys.stderr)
return False
# Step 8: Verify block content in Advanced Foundations Review
print("8. Checking Advanced Foundations Review page content...")
try:
blocks = notion_utils.get_all_blocks_recursively(notion, bridge_id)
if len(blocks) < 3:
print(f"Error: Advanced Foundations Review should have at least 3 blocks.", file=sys.stderr)
return False
# Check Block 1: Heading 2
block1 = blocks[0]
if block1.get("type") != "heading_2":
print(f"Error: First block should be heading_2.", file=sys.stderr)
return False
heading_text = block1.get("heading_2", {}).get("rich_text", [{}])[0].get("text", {}).get("content", "")
if heading_text != "Prerequisites Checklist":
print(f"Error: Heading should be 'Prerequisites Checklist'.", file=sys.stderr)
return False
# Check Block 2: Bulleted list
block2 = blocks[1]
if block2.get("type") != "bulleted_list_item":
print(f"Error: Second block should be bulleted_list_item.", file=sys.stderr)
return False
# Check Block 3 and 4 are also bulleted list items
if len(blocks) >= 4:
block3 = blocks[2]
block4 = blocks[3]
if block3.get("type") != "bulleted_list_item" or block4.get("type") != "bulleted_list_item":
print(f"Error: Blocks 2-4 should be bulleted list items.", file=sys.stderr)
return False
# Check last block is paragraph
last_block = blocks[-1]
if last_block.get("type") != "paragraph":
print(f"Error: Last block should be paragraph.", file=sys.stderr)
return False
paragraph_text = last_block.get("paragraph", {}).get("rich_text", [{}])[0].get("text", {}).get("content", "")
if "checkpoint" not in paragraph_text.lower():
print(f"Error: Paragraph should contain text about checkpoint.", file=sys.stderr)
return False
print(f"✓ Advanced Foundations Review page has correct content structure")
except Exception as e:
print(f"Error checking page content: {e}", file=sys.stderr)
return False
# Step 9: Final verification counts
print("9. Verifying final state counts...")
try:
# Count total lessons by status
all_lessons = notion.databases.query(database_id=steps_db_id, page_size=100)["results"]
done_lessons = [l for l in all_lessons if l["properties"]["Status"]["status"]["name"] == "Done"]
done_count = len(done_lessons)
in_progress_count = sum(1 for l in all_lessons if l["properties"]["Status"]["status"]["name"] == "In Progress")
# Print out all Done lessons for debugging
if done_count != 14:
print(f"Found {done_count} Done lessons (expected 14):", file=sys.stderr)
for lesson in done_lessons:
lesson_name = lesson["properties"]["Lessons"]["title"][0]["text"]["content"]
print(f" - {lesson_name}", file=sys.stderr)
return False
if in_progress_count != 1:
print(f"Error: Should have 1 In Progress lesson, found {in_progress_count}.", file=sys.stderr)
return False
# Verify Expert Level has 5 lessons
expert_chapter_updated = notion.databases.query(
database_id=chapters_db_id,
filter={
"property": "Name",
"title": {
"equals": "Expert Level"
}
}
)["results"][0]
expert_steps = expert_chapter_updated["properties"]["Steps"]["relation"]
if len(expert_steps) != 5:
print(f"Error: Expert Level should have exactly 5 lessons, found {len(expert_steps)}.", file=sys.stderr)
return False
print(f"✓ Final state counts are correct")
except Exception as e:
print(f"Error verifying final counts: {e}", file=sys.stderr)
return False
print("🎉 All verification checks passed!")
return True
def main():
"""
Executes the verification process and exits with a status code.
"""
notion = notion_utils.get_notion_client()
main_id = sys.argv[1] if len(sys.argv) > 1 else None
if verify(notion, main_id):
sys.exit(0)
else:
sys.exit(1)
if __name__ == "__main__":
main()
================================================
FILE: tasks/notion/standard/python_roadmap/learning_metrics_dashboard/description.md
================================================
# Task: Learning Metrics Dashboard
## Objective
Create a comprehensive Learning Metrics Dashboard section in the Python Roadmap page that displays precise statistics and recommendations based on the Steps database content.
## Requirements
### 1. Section Placement
- Add new content immediately after the Learning Materials section (before `Whether you're starting from scratch or`).
### 2. Dashboard Header
- **Type**: heading_3
- **Text**: `📊 Learning Metrics Dashboard`
### 3. Course Statistics Block
- **Type**: callout
- **Background Color**: Brown
- **Icon**: None
- **Title**: **Course Statistics** (bold, heading_3). Use the same color scheme as other callout headings.
- **Content**: Bulleted list with the following items in exact order:
- `Total Lessons: [X]` (count all entries in Steps database)
- `Completed: [X] ([Y]%)` (count Status="Done", calculate percentage to 1 decimal)
- `In Progress: [X] ([Y]%)` (count Status="In Progress", calculate percentage to 1 decimal)
- `Beginner Level: [X] lessons ([Y] completed)` (filter by Chapters relation to Beginner Level)
- `Intermediate Level: [X] lessons ([Y] completed)` (filter by Chapters relation to Intermediate Level)
- `Advanced Level: [X] lessons ([Y] completed)` (filter by Chapters relation to Advanced Level)
### 4. Completed Topics Section
- **Type**: toggle
- **Text**: `🏆 Completed Topics (Click to expand)`
- **Nested Content**: Numbered list containing exactly 5 items
- List lessons with Status="Done"
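As a rough illustration, the statistics above can be derived from one query of the Steps database; the database ID below is a placeholder and the `Status`/`Chapters` property names are the ones used elsewhere on this board:

```python
from tasks.utils import notion_utils

notion = notion_utils.get_notion_client()

STEPS_DB_ID = "<steps-database-id>"  # placeholder: locate the Steps database on the page first

lessons = notion.databases.query(database_id=STEPS_DB_ID, page_size=100)["results"]

def status_name(lesson):
    status = lesson["properties"]["Status"]["status"]
    return status["name"] if status else None

total = len(lessons)
done = sum(1 for l in lessons if status_name(l) == "Done")
in_progress = sum(1 for l in lessons if status_name(l) == "In Progress")

# Percentages are reported to one decimal place, as required above.
done_pct = round(done / total * 100, 1) if total else 0.0
in_progress_pct = round(in_progress / total * 100, 1) if total else 0.0

stats_lines = [
    f"Total Lessons: {total}",
    f"Completed: {done} ({done_pct}%)",
    f"In Progress: {in_progress} ({in_progress_pct}%)",
    # The three per-level lines are derived the same way, filtering each lesson's
    # "Chapters" relation against the Beginner/Intermediate/Advanced chapter IDs.
]
```

The resulting strings would then become `bulleted_list_item` children of the brown Course Statistics callout, and the lessons with Status "Done" would populate the toggle as `numbered_list_item` blocks.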
================================================
FILE: tasks/notion/standard/python_roadmap/learning_metrics_dashboard/meta.json
================================================
{
"task_id": "learning_metrics_dashboard",
"task_name": "Learning Metrics Dashboard",
"category_id": "python_roadmap",
"category_name": "Python Roadmap",
"description": "Create a comprehensive Learning Metrics Dashboard section displaying precise statistics and recommendations based on the Steps database.",
"author": "Lingjun Chen",
"created_at": "2025-08-02",
"difficulty": "L3",
"tags": [
"data aggregation",
"conditional filtering",
"report generation",
"visual formatting"
],
"mcp": [
"notion"
],
"meta_data": {
"stateType": "url",
"stateContent": null,
"stateUrl": "https://painted-tennis-ebc.notion.site/Python-Roadmap-25281626b6d78012bf2bce1fa8711f4d",
"stateOriginalUrl": "https://www.notion.so/marketplace/templates/python-roadmap"
}
}
================================================
FILE: tasks/notion/standard/python_roadmap/learning_metrics_dashboard/verify.py
================================================
import sys
from notion_client import Client
from tasks.utils import notion_utils
def get_page_title_from_result(page_result):
"""
Extract the title from a page result object from database query.
"""
properties = page_result.get('properties', {})
# Try common title property names
for prop_name in ['Name', 'Title', 'title', 'Lessons']:
if prop_name in properties:
prop = properties[prop_name]
if prop.get('type') == 'title':
title_array = prop.get('title', [])
if title_array and len(title_array) > 0:
return title_array[0].get('plain_text', '')
return ''
def verify(notion: Client, main_id: str = None) -> bool:
"""
Verifies that the Learning Metrics Dashboard has been implemented correctly according to description.md.
"""
# Step 1: Find the main page and get all blocks
if main_id:
found_id, object_type = notion_utils.find_page_or_database_by_id(notion, main_id)
if not found_id or object_type != 'page':
print("Error: Main page not found.", file=sys.stderr)
return False
else:
# Try to find the main page by searching
found_id = notion_utils.find_page(notion, "Python Roadmap")
if not found_id:
print("Error: Main page not found.", file=sys.stderr)
return False
print(f"Found main page: {found_id}")
# Get Steps database to calculate expected statistics
steps_db_id = notion_utils.find_database(notion, "Steps")
if not steps_db_id:
print("Error: Steps database not found.", file=sys.stderr)
return False
# Query Steps database to get all lessons
steps_data = notion.databases.query(database_id=steps_db_id)
total_lessons = len(steps_data['results'])
completed_count = 0
in_progress_count = 0
completed_lessons = []
# Get Chapters database for level information
chapters_db_id = notion_utils.find_database(notion, "Chapters")
if not chapters_db_id:
print("Error: Chapters database not found.", file=sys.stderr)
return False
# Query Chapters database to get level information
chapters_data = notion.databases.query(database_id=chapters_db_id)
level_ids = {
'Beginner Level': None,
'Intermediate Level': None,
'Advanced Level': None
}
for chapter in chapters_data['results']:
chapter_name = get_page_title_from_result(chapter)
if chapter_name in level_ids:
level_ids[chapter_name] = chapter['id']
# Initialize level counts
level_counts = {
'Beginner Level': {'total': 0, 'completed': 0},
'Intermediate Level': {'total': 0, 'completed': 0},
'Advanced Level': {'total': 0, 'completed': 0}
}
# Count lessons by status and level
for lesson in steps_data['results']:
status = lesson['properties']['Status']['status']
if status and status['name'] == 'Done':
completed_count += 1
lesson_title = get_page_title_from_result(lesson)
if lesson_title:
completed_lessons.append(lesson_title)
elif status and status['name'] == 'In Progress':
in_progress_count += 1
# Count by level
chapters_relation = lesson['properties']['Chapters']['relation']
for chapter_ref in chapters_relation:
chapter_id = chapter_ref['id']
for level_name, level_id in level_ids.items():
if chapter_id == level_id:
level_counts[level_name]['total'] += 1
if status and status['name'] == 'Done':
level_counts[level_name]['completed'] += 1
# Calculate percentages
completed_percentage = round((completed_count / total_lessons * 100), 1) if total_lessons > 0 else 0
in_progress_percentage = round((in_progress_count / total_lessons * 100), 1) if total_lessons > 0 else 0
print(f"Expected statistics:")
print(f" Total Lessons: {total_lessons}")
print(f" Completed: {completed_count} ({completed_percentage}%)")
print(f" In Progress: {in_progress_count} ({in_progress_percentage}%)")
print(f" Beginner Level: {level_counts['Beginner Level']['total']} lessons ({level_counts['Beginner Level']['completed']} completed)")
print(f" Intermediate Level: {level_counts['Intermediate Level']['total']} lessons ({level_counts['Intermediate Level']['completed']} completed)")
print(f" Advanced Level: {level_counts['Advanced Level']['total']} lessons ({level_counts['Advanced Level']['completed']} completed)")
print(f" Completed lessons (first 5): {completed_lessons[:5]}")
# Get all blocks from the page
all_blocks = notion_utils.get_all_blocks_recursively(notion, found_id)
print(f"Found {len(all_blocks)} blocks")
# Step 2: Verify the required elements in order
learning_materials_idx = -1
dashboard_heading_idx = -1
callout_idx = -1
toggle_idx = -1
whether_paragraph_idx = -1 # Track the "Whether you're starting from scratch" paragraph
# Track what we've verified
callout_has_brown_bg = False
callout_has_no_icon = False
callout_has_course_statistics_title = False
callout_title_has_correct_colors = False
statistics_items_found = []
completed_topics_found = []
# Expected statistics content
expected_statistics = [
f"Total Lessons: {total_lessons}",
f"Completed: {completed_count} ({completed_percentage}%)",
f"In Progress: {in_progress_count} ({in_progress_percentage}%)",
f"Beginner Level: {level_counts['Beginner Level']['total']} lessons ({level_counts['Beginner Level']['completed']} completed)",
f"Intermediate Level: {level_counts['Intermediate Level']['total']} lessons ({level_counts['Intermediate Level']['completed']} completed)",
f"Advanced Level: {level_counts['Advanced Level']['total']} lessons ({level_counts['Advanced Level']['completed']} completed)"
]
# Check blocks in order
for i, block in enumerate(all_blocks):
if block is None:
continue
block_type = block.get("type")
# 1. Check for Learning Materials heading (requirement 1)
if learning_materials_idx == -1 and block_type == "heading_3":
block_text = notion_utils.get_block_plain_text(block)
if "🎓 Learning Materials" in block_text or "Learning Materials" in block_text:
learning_materials_idx = i
print(f"✓ Requirement 1: Found Learning Materials heading at position {i}")
# 2. Check for Learning Metrics Dashboard heading after Learning Materials (requirement 2)
elif learning_materials_idx != -1 and dashboard_heading_idx == -1 and block_type == "heading_3":
block_text = notion_utils.get_block_plain_text(block)
if "📊 Learning Metrics Dashboard" in block_text:
dashboard_heading_idx = i
print(f"✓ Requirement 2: Found Learning Metrics Dashboard heading at position {i}")
# 3. Check for callout block after Dashboard heading (requirement 3)
elif dashboard_heading_idx != -1 and callout_idx == -1 and block_type == "callout":
callout_idx = i
print(f" Found callout block at position {i}")
# Check brown background (requirement 3.1)
if block.get("callout", {}).get("color") == "brown_background":
callout_has_brown_bg = True
print(f" ✓ Requirement 3.1: Callout has brown background")
# Check no icon (requirement 3.2)
icon = block.get("callout", {}).get("icon")
if icon is None:
callout_has_no_icon = True
print(f" ✓ Requirement 3.2: Callout has no icon")
# Get nested blocks for Course Statistics title and content
nested_blocks = notion_utils.get_all_blocks_recursively(notion, block.get("id"))
for nested in nested_blocks:
# Check for heading_3 only as per requirement
if nested and nested.get("type") == "heading_3":
# Check for "Course Statistics" title with correct formatting
rich_text = nested.get("heading_3", {}).get("rich_text", [])
course_found = False
course_correct = False
statistics_found = False
statistics_correct = False
for text_item in rich_text:
text_content = text_item.get("text", {}).get("content", "")
annotations = text_item.get("annotations", {})
color = annotations.get("color", "default")
is_bold = annotations.get("bold", False)
if "Course" in text_content:
course_found = True
# Check if Course is blue and bold
if color == "blue" and is_bold:
course_correct = True
print(f" ✓ 'Course' has blue color and is bold")
else:
print(f" ✗ 'Course' color: {color}, bold: {is_bold} (should be blue and bold)")
if "Statistics" in text_content:
statistics_found = True
# Check if Statistics is yellow and bold
if color == "yellow" and is_bold:
statistics_correct = True
print(f" ✓ 'Statistics' has yellow color and is bold")
else:
print(f" ✗ 'Statistics' color: {color}, bold: {is_bold} (should be yellow and bold)")
if course_found and statistics_found:
callout_has_course_statistics_title = True
if course_correct and statistics_correct:
callout_title_has_correct_colors = True
print(f" ✓ Requirement 3.3: Callout has 'Course Statistics' title with correct colors")
else:
print(f" ✗ Requirement 3.3: Title found but colors/formatting incorrect")
# Check for statistics items in bulleted list
elif nested and nested.get("type") == "bulleted_list_item":
item_text = notion_utils.get_block_plain_text(nested)
for expected_item in expected_statistics:
if expected_item in item_text:
if expected_item not in statistics_items_found:
statistics_items_found.append(expected_item)
print(f" ✓ Requirement 3.4: Found statistics item: {expected_item}")
# 4. Check for Completed Topics toggle after callout (requirement 4)
elif callout_idx != -1 and toggle_idx == -1 and block_type == "toggle":
block_text = notion_utils.get_block_plain_text(block)
if "🏆 Completed Topics (Click to expand)" in block_text:
toggle_idx = i
print(f"✓ Requirement 4: Found Completed Topics toggle at position {i}")
# Get nested blocks for completed topics list
nested_blocks = notion_utils.get_all_blocks_recursively(notion, block.get("id"))
for nested in nested_blocks:
if nested and nested.get("type") == "numbered_list_item":
item_text = notion_utils.get_block_plain_text(nested)
if item_text and item_text in completed_lessons:
completed_topics_found.append(item_text)
print(f" ✓ Requirement 4.1: Found completed topic: {item_text}")
# 5. Check for "Whether you're starting from scratch" paragraph (should be after dashboard content)
elif block_type == "paragraph" and whether_paragraph_idx == -1:
block_text = notion_utils.get_block_plain_text(block)
            # Match both straight and curly apostrophes in "you're".
            if (
                "Whether you're starting from scratch" in block_text
                or "Whether you’re starting from scratch" in block_text
            ):
whether_paragraph_idx = i
print(f" Found 'Whether you're starting from scratch' paragraph at position {i}")
# Step 3: Verify all requirements were met
print(f"\nVerification Summary:")
all_passed = True
# Requirement 1: Learning Materials section found
if learning_materials_idx == -1:
print("✗ Requirement 1: Learning Materials section NOT found", file=sys.stderr)
all_passed = False
else:
print("✓ Requirement 1: Learning Materials section found")
# Requirement 2: Learning Metrics Dashboard heading after Learning Materials and before "Whether..." paragraph
if dashboard_heading_idx == -1:
print("✗ Requirement 2: Learning Metrics Dashboard heading NOT found", file=sys.stderr)
all_passed = False
elif dashboard_heading_idx <= learning_materials_idx:
print("✗ Requirement 2: Learning Metrics Dashboard heading not AFTER Learning Materials", file=sys.stderr)
all_passed = False
elif whether_paragraph_idx != -1 and dashboard_heading_idx >= whether_paragraph_idx:
print("✗ Requirement 2: Learning Metrics Dashboard heading not BEFORE 'Whether you're starting from scratch' paragraph", file=sys.stderr)
all_passed = False
else:
print("✓ Requirement 2: Learning Metrics Dashboard heading found after Learning Materials")
if whether_paragraph_idx != -1:
print(" ✓ Dashboard content is correctly placed before 'Whether you're starting from scratch' paragraph")
# Requirement 3: Course Statistics callout block with all specifications
if callout_idx == -1:
print("✗ Requirement 3: Course Statistics callout block NOT found", file=sys.stderr)
all_passed = False
else:
if not callout_has_brown_bg:
print("✗ Requirement 3.1: Callout does NOT have brown background", file=sys.stderr)
all_passed = False
else:
print("✓ Requirement 3.1: Callout has brown background")
if not callout_has_no_icon:
print("✗ Requirement 3.2: Callout has an icon (should have none)", file=sys.stderr)
all_passed = False
else:
print("✓ Requirement 3.2: Callout has no icon")
if not callout_has_course_statistics_title:
print("✗ Requirement 3.3: Callout does NOT have 'Course Statistics' title", file=sys.stderr)
all_passed = False
else:
print("✓ Requirement 3.3: Callout has 'Course Statistics' title")
if not callout_title_has_correct_colors:
print("✗ Requirement 3.3.1: Title does NOT have correct colors (blue for Course, yellow for Statistics)", file=sys.stderr)
all_passed = False
else:
print("✓ Requirement 3.3.1: Title has correct colors")
# Check all statistics items
missing_items = [item for item in expected_statistics if item not in statistics_items_found]
if missing_items:
print(f"✗ Requirement 3.4: Missing statistics items: {missing_items}", file=sys.stderr)
all_passed = False
else:
print("✓ Requirement 3.4: All 6 statistics items found")
# Requirement 4: Completed Topics toggle
if toggle_idx == -1:
print("✗ Requirement 4: Completed Topics toggle NOT found", file=sys.stderr)
all_passed = False
elif toggle_idx <= callout_idx:
print("✗ Requirement 4: Completed Topics toggle not AFTER callout", file=sys.stderr)
all_passed = False
else:
print("✓ Requirement 4: Completed Topics toggle found after callout")
# Check that exactly 5 completed topics are listed
if len(completed_topics_found) != 5:
if len(completed_topics_found) < 5:
print(f"✗ Requirement 4.1: Only {len(completed_topics_found)} completed topics found (need exactly 5)", file=sys.stderr)
else:
print(f"✗ Requirement 4.1: Found {len(completed_topics_found)} completed topics (need exactly 5, not more)", file=sys.stderr)
all_passed = False
else:
print(f"✓ Requirement 4.1: Found exactly 5 completed topics as required")
# Requirement 5: Proper integration (implicitly checked by order)
if all_passed:
print("✓ Requirement 5: All content properly integrated in correct order")
return all_passed
def main():
"""
Executes the verification process and exits with a status code.
"""
notion = notion_utils.get_notion_client()
main_id = sys.argv[1] if len(sys.argv) > 1 else None
if verify(notion, main_id):
print("Verification passed")
sys.exit(0)
else:
print("Verification failed")
sys.exit(1)
if __name__ == "__main__":
main()
================================================
FILE: tasks/notion/standard/self_assessment/faq_column_layout/description.md
================================================
Navigate to the "Self Assessment" page and reorganize the content under the FAQ toggle as follows:
**Task Requirements:**
1. Add a column list with two columns inside the FAQ toggle
2. Move the first two existing Q&A pairs from the FAQ to the left column
3. Move the third existing Q&A pair to the right column
4. Add one additional Q&A pair in the right column to match the format, so both columns have exactly 2 Q&A pairs
5. Ensure all Q&A pairs maintain consistent formatting (heading_3 for questions, paragraph for answers)
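Since the Notion API has no block-move operation, one workable approach is to recreate the Q&A pairs inside a new column list appended to the FAQ toggle and then archive the originals; a minimal sketch with placeholder text and IDs:

```python
from tasks.utils import notion_utils

notion = notion_utils.get_notion_client()

FAQ_TOGGLE_ID = "<faq-toggle-block-id>"  # placeholder: locate the toggle on the page first

def qa(question, answer):
    """One Q&A pair: a heading_3 question followed by a paragraph answer."""
    return [
        {"object": "block", "type": "heading_3",
         "heading_3": {"rich_text": [{"type": "text", "text": {"content": question}}]}},
        {"object": "block", "type": "paragraph",
         "paragraph": {"rich_text": [{"type": "text", "text": {"content": answer}}]}},
    ]

# Read the existing Q&A text first, then rebuild it inside the two columns.
left_pairs = (
    qa("<first existing question>", "<first existing answer>")
    + qa("<second existing question>", "<second existing answer>")
)
right_pairs = (
    qa("<third existing question>", "<third existing answer>")
    + qa("<new question>", "<new answer>")
)

notion.blocks.children.append(
    block_id=FAQ_TOGGLE_ID,
    children=[{
        "object": "block",
        "type": "column_list",
        "column_list": {
            "children": [
                {"object": "block", "type": "column", "column": {"children": left_pairs}},
                {"object": "block", "type": "column", "column": {"children": right_pairs}},
            ]
        },
    }],
)

# The original top-level Q&A blocks would then be removed, e.g.
# notion.blocks.delete(block_id=old_block_id) for each moved block.
```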
================================================
FILE: tasks/notion/standard/self_assessment/faq_column_layout/meta.json
================================================
{
"task_id": "faq_column_layout",
"task_name": "FAQ Column Layout",
"category_id": "self_assessment",
"category_name": "Self Assessment",
"description": "Reorganize the FAQ section content into a two-column layout with balanced Q&A pairs.",
"author": "Xiangyan Liu",
"created_at": "2025-07-27",
"difficulty": "L3",
"tags": [
"content organization",
"visual formatting",
"template population"
],
"mcp": [
"notion"
],
"meta_data": {
"stateType": "url",
"stateContent": null,
"stateUrl": "https://painted-tennis-ebc.notion.site/Self-Assessment-24381626b6d780fe9f56c2ba14ea042d",
"stateOriginalUrl": "https://painted-tennis-ebc.notion.site/Self-Assessment-24381626b6d780fe9f56c2ba14ea042d"
}
}
================================================
FILE: tasks/notion/standard/self_assessment/faq_column_layout/verify.py
================================================
import sys
from notion_client import Client
from tasks.utils import notion_utils
def verify(notion: Client, main_id: str = None) -> bool:
"""
Verifies that the FAQ toggle has been properly reorganized with a column list.
"""
# Start from main_id if provided
page_id = None
if main_id:
found_id, object_type = notion_utils.find_page_or_database_by_id(
notion, main_id
)
if found_id and object_type == "page":
page_id = found_id
if not page_id:
# Try to find the Self Assessment page
page_id = notion_utils.find_page(notion, "Self Assessment")
if not page_id:
print("Error: Self Assessment page not found.", file=sys.stderr)
return False
# Get all blocks recursively from the page
all_blocks = notion_utils.get_all_blocks_recursively(notion, page_id)
# Find the FAQ toggle block
faq_toggle_block = None
faq_toggle_id = None
for block in all_blocks:
if block.get("type") == "toggle":
block_text = notion_utils.get_block_plain_text(block)
if "FAQ" in block_text:
faq_toggle_block = block
faq_toggle_id = block.get("id")
print(f"Found FAQ toggle block: {block_text}")
break
if not faq_toggle_block:
print("Error: FAQ toggle block not found.", file=sys.stderr)
return False
# Find column_list inside the FAQ toggle
column_list_block = None
for block in all_blocks:
if (
block.get("type") == "column_list"
and block.get("parent", {}).get("block_id") == faq_toggle_id
):
column_list_block = block
break
if not column_list_block:
print("Error: No column_list found inside FAQ toggle.", file=sys.stderr)
return False
# Check that there are no Q&A pairs directly under FAQ toggle (outside column_list)
direct_faq_children = []
for block in all_blocks:
if block.get("parent", {}).get("block_id") == faq_toggle_id and block.get(
"id"
) != column_list_block.get("id"):
direct_faq_children.append(block)
# Check if any of these are heading_3 or paragraph blocks (Q&A content)
for block in direct_faq_children:
if block.get("type") in ["heading_3", "paragraph"]:
print(
f"Error: Found Q&A content outside column_list: {notion_utils.get_block_plain_text(block)[:50]}...",
file=sys.stderr,
)
return False
# Find the two columns
columns = []
column_list_id = column_list_block.get("id")
for block in all_blocks:
if (
block.get("type") == "column"
and block.get("parent", {}).get("block_id") == column_list_id
):
columns.append(block)
if len(columns) != 2:
print(f"Error: Expected 2 columns, found {len(columns)}.", file=sys.stderr)
return False
# Check each column has exactly 2 Q&A pairs
for i, column in enumerate(columns):
column_id = column.get("id")
# Find blocks inside this column
column_blocks = []
for block in all_blocks:
if block.get("parent", {}).get("block_id") == column_id:
column_blocks.append(block)
# Count Q&A pairs (should be heading_3 followed by paragraph)
qa_pairs = 0
j = 0
while j < len(column_blocks):
if (
column_blocks[j].get("type") == "heading_3"
and j + 1 < len(column_blocks)
and column_blocks[j + 1].get("type") == "paragraph"
):
qa_pairs += 1
j += 2 # Skip both question and answer
else:
j += 1
if qa_pairs != 2:
print(
f"Error: Column {i + 1} has {qa_pairs} Q&A pairs, expected 2.",
file=sys.stderr,
)
return False
print(f"Column {i + 1}: Found {qa_pairs} Q&A pairs ✓")
print(
"Success: FAQ toggle properly organized with 2 columns, each containing 2 Q&A pairs."
)
return True
def main():
"""
Executes the verification process and exits with a status code.
"""
notion = notion_utils.get_notion_client()
main_id = sys.argv[1] if len(sys.argv) > 1 else None
if verify(notion, main_id):
sys.exit(0)
else:
sys.exit(1)
if __name__ == "__main__":
main()
================================================
FILE: tasks/notion/standard/self_assessment/hyperfocus_analysis_report/description.md
================================================
Go to my Self Assessment page, and then create a hyperfocus analysis report by analyzing sessions with high productivity but significant challenges.
**Task Requirements:**
1. Create a new page titled "Hyperfocus Analysis Report" as a child of the Self Assessment page. The new page should be located between the 'Why Use the Term "Hyperfocus"?' callout and the divider line that follows it.
2. Query the "Hyperfocus Self-Assessment Worksheet" database to find all sessions where:
- Work Completion Rate is greater than 80% (0.8)
- At least one challenge is present in the Challenges field
3. For each qualifying session, create a section with:
- A heading showing the date and activity type (format: YYYY-MM-DD Activity)
- A bullet list containing:
- Focus factors used (e.g., Focus factors: XXX, YYY)
- Energy level and mood (format: "Energy: X/10, Mood: Y/10")
- Challenges faced (e.g., Challenges: XXX, YYY)
- Strategies that helped overcome challenges (e.g., Strategies: XXX, YYY)
- Work completion rate (format: "Completion: XX%")
4. At the top of the page, add a callout block (type: "info") with:
- Title: "Top 2 Most Effective Strategies"
- Content: List the 2 most frequently used strategies from all sessions, each on a new line with format "• Strategy Name (used in X sessions)"
**Structure Requirements:**
- The page must have the exact title "Hyperfocus Analysis Report"
- Each session section must start with a level 2 heading
- All session details must be in bullet point format
- The summary callout must be at the top of the page before any session details
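For reference, a small sketch of the two database queries this report needs, using a placeholder database ID and the property names that appear in this worksheet (`Work Completion Rate`, `Challenges`, `Key Strategies Used`):

```python
from collections import Counter
from tasks.utils import notion_utils

notion = notion_utils.get_notion_client()

WORKSHEET_DB_ID = "<hyperfocus-worksheet-database-id>"  # placeholder

# Requirement 2: sessions with >80% completion and at least one challenge.
qualifying = notion.databases.query(
    database_id=WORKSHEET_DB_ID,
    filter={
        "and": [
            {"property": "Work Completion Rate", "number": {"greater_than": 0.8}},
            {"property": "Challenges", "multi_select": {"is_not_empty": True}},
        ]
    },
)["results"]

# Requirement 4: the top 2 strategies are counted over ALL sessions,
# not just the qualifying ones.
all_sessions = notion.databases.query(database_id=WORKSHEET_DB_ID)["results"]
strategy_counts = Counter(
    s["name"]
    for session in all_sessions
    for s in session["properties"]["Key Strategies Used"]["multi_select"]
)
top_two = strategy_counts.most_common(2)

callout_lines = [f"• {name} (used in {count} sessions)" for name, count in top_two]
```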
================================================
FILE: tasks/notion/standard/self_assessment/hyperfocus_analysis_report/meta.json
================================================
{
"task_id": "hyperfocus_analysis_report",
"task_name": "Hyperfocus Analysis Report",
"category_id": "self_assessment",
"category_name": "Self Assessment",
"description": "Create a hyperfocus analysis report by analyzing high-productivity sessions with challenges.",
"author": "Xiangyan Liu",
"created_at": "2025-07-27",
"difficulty": "L3",
"tags": [
"conditional filtering",
"data aggregation",
"report generation",
"visual formatting"
],
"mcp": [
"notion"
],
"meta_data": {
"stateType": "url",
"stateContent": null,
"stateUrl": "https://painted-tennis-ebc.notion.site/Self-Assessment-24381626b6d780fe9f56c2ba14ea042d",
"stateOriginalUrl": "https://painted-tennis-ebc.notion.site/Self-Assessment-24381626b6d780fe9f56c2ba14ea042d"
}
}
================================================
FILE: tasks/notion/standard/self_assessment/hyperfocus_analysis_report/verify.py
================================================
import sys
import re
from notion_client import Client
from tasks.utils import notion_utils
from collections import Counter
def validate_comma_separated(text: str, expected_items: list) -> bool:
"""
Validates that a comma-separated list contains expected items (case-insensitive).
"""
if not text or not expected_items:
return False
# Extract items from text
items = [item.strip().lower() for item in text.split(",")]
expected_lower = [item.lower() for item in expected_items]
# Check if all expected items are present
for expected in expected_lower:
if not any(expected in item or item in expected for item in items):
return False
return True
def verify(notion: Client, main_id: str = None) -> bool:
"""
Verifies that the Hyperfocus Analysis Report has been created correctly.
"""
# Find the Self Assessment page
self_assessment_page_id = main_id
if main_id:
found_id, object_type = notion_utils.find_page_or_database_by_id(
notion, main_id
)
if found_id and object_type == "page":
self_assessment_page_id = found_id
if not self_assessment_page_id:
# Try to find by name
self_assessment_page_id = notion_utils.find_page(notion, "Self Assessment")
if not self_assessment_page_id:
print("Error: Self Assessment page not found.", file=sys.stderr)
return False
# Find the Hyperfocus Analysis Report page
report_page_id = None
report_position = -1
callout_position = -1
divider_position = -1
children = notion.blocks.children.list(block_id=self_assessment_page_id).get(
"results", []
)
for i, child in enumerate(children):
# Track position of callout with "Why Use the Term"
if child.get("type") == "callout":
callout_text = notion_utils.get_block_plain_text(child)
if "Why Use the Term" in callout_text and "Hyperfocus" in callout_text:
callout_position = i
# Track position of divider
elif child.get("type") == "divider":
if callout_position != -1 and divider_position == -1:
divider_position = i
# Find the report page
elif child.get("type") == "child_page":
page_data = notion.pages.retrieve(page_id=child["id"])
title_prop = (
page_data.get("properties", {}).get("title", {}).get("title", [])
)
if (
title_prop
and title_prop[0].get("plain_text") == "Hyperfocus Analysis Report"
):
report_page_id = child["id"]
report_position = i
if not report_page_id:
print("Error: 'Hyperfocus Analysis Report' page not found.", file=sys.stderr)
return False
# Verify position
if callout_position == -1:
print(
"Error: Could not find 'Why Use the Term \"Hyperfocus\"?' callout.",
file=sys.stderr,
)
return False
if divider_position == -1:
print("Error: Could not find divider after the callout.", file=sys.stderr)
return False
if not (callout_position < report_position < divider_position):
print(
f"Error: Report page is not positioned between callout and divider. Positions: callout={callout_position}, report={report_position}, divider={divider_position}",
file=sys.stderr,
)
return False
# Get all blocks from the report page
all_blocks = notion_utils.get_all_blocks_recursively(notion, report_page_id)
# Find the database in the Self Assessment page
database_id = None
for block in notion_utils.get_all_blocks_recursively(
notion, self_assessment_page_id
):
if block.get("type") == "child_database":
db_data = notion.databases.retrieve(database_id=block["id"])
db_title = "".join(
[t.get("plain_text", "") for t in db_data.get("title", [])]
)
if "Hyperfocus Self-Assessment Worksheet" in db_title:
database_id = block["id"]
break
if not database_id:
print(
"Error: Database 'Hyperfocus Self-Assessment Worksheet' not found.",
file=sys.stderr,
)
return False
# Query database for sessions with >80% completion rate and challenges
query_results = notion.databases.query(
database_id=database_id,
filter={
"and": [
{"property": "Work Completion Rate", "number": {"greater_than": 0.8}},
{"property": "Challenges", "multi_select": {"is_not_empty": True}},
]
},
).get("results", [])
if not query_results:
print(
"Warning: No sessions found with >80% completion rate and challenges.",
file=sys.stderr,
)
# Still check if the page structure is correct
# Verify page structure
has_callout = False
has_top_strategies = False
session_count = 0
found_sessions = {} # Track sessions by date for validation
# Track strategies for validation - count from ALL sessions
all_sessions = notion.databases.query(database_id=database_id).get("results", [])
all_strategies = []
for session in all_sessions:
strategies = (
session.get("properties", {})
.get("Key Strategies Used", {})
.get("multi_select", [])
)
all_strategies.extend([s.get("name") for s in strategies])
strategy_counts = Counter(all_strategies)
top_2_strategies = strategy_counts.most_common(2)
# Build expected sessions from query results with all data
expected_sessions = {}
for result in query_results:
date_prop = result.get("properties", {}).get("Date", {}).get("date", {})
activity_prop = (
result.get("properties", {}).get("Activity", {}).get("select", {})
)
if date_prop and date_prop.get("start") and activity_prop:
date_str = date_prop["start"]
activity_name = activity_prop.get("name", "")
# Extract all session data for validation
focus_factors = [
f.get("name", "")
for f in result.get("properties", {})
.get("Focus Factors", {})
.get("multi_select", [])
]
challenges = [
c.get("name", "")
for c in result.get("properties", {})
.get("Challenges", {})
.get("multi_select", [])
]
strategies = [
s.get("name", "")
for s in result.get("properties", {})
.get("Key Strategies Used", {})
.get("multi_select", [])
]
energy = result.get("properties", {}).get("Energy Level", {}).get("number")
mood = result.get("properties", {}).get("Mood", {}).get("number")
completion = (
result.get("properties", {})
.get("Work Completion Rate", {})
.get("number")
)
expected_sessions[date_str] = {
"activity": activity_name,
"focus_factors": focus_factors,
"challenges": challenges,
"strategies": strategies,
"energy": energy,
"mood": mood,
"completion": completion,
}
current_session_date = None
current_session_data = None
session_bullet_points = {} # Track bullet points for each session
for i, block in enumerate(all_blocks):
block_type = block.get("type")
# Check for callout at the top
if block_type == "callout" and i < 5: # Should be near the top
callout_text = notion_utils.get_block_plain_text(block)
if "Top 2 Most Effective Strategies" in callout_text:
has_callout = True
# Check if it contains strategy information
s1, n1 = top_2_strategies[0]
s2, n2 = top_2_strategies[1]
t1 = f"{s1} (used in {n1} sessions)"
t2 = f"{s2} (used in {n2} sessions)"
if t1 in callout_text and t2 in callout_text:
has_top_strategies = True
break
# Check for session headings with format YYYY-MM-DD Activity
if block_type == "heading_2":
heading_text = notion_utils.get_block_plain_text(block)
# Check if heading matches expected format
for date_str, session_data in expected_sessions.items():
activity = session_data["activity"]
expected_heading = f"{date_str} {activity}"
if expected_heading in heading_text:
found_sessions[date_str] = session_data
session_count += 1
current_session_date = date_str
current_session_data = session_data
session_bullet_points[date_str] = []
break
# Check for bullet points with session details
if block_type == "bulleted_list_item" and current_session_data:
bullet_text = notion_utils.get_block_plain_text(block)
# Track bullet points for current session
if current_session_date:
session_bullet_points[current_session_date].append(bullet_text)
# Validate specific bullet point content
if bullet_text.startswith("Focus factors"):
content = bullet_text.split(":", 1)[1].strip()
expected_factors = current_session_data.get("focus_factors", [])
if not validate_comma_separated(content, expected_factors):
print(
f"Error: Focus factors mismatch for {current_session_date}. Expected: {expected_factors}, Found: {content}",
file=sys.stderr,
)
return False
elif "Energy" in bullet_text and "Mood" in bullet_text:
# Extract energy and mood values
energy_match = re.search(r"Energy:\s*(\d+)/10", bullet_text)
mood_match = re.search(r"Mood:\s*(\d+)/10", bullet_text)
if energy_match and mood_match:
found_energy = int(energy_match.group(1))
found_mood = int(mood_match.group(1))
expected_energy = current_session_data.get("energy")
expected_mood = current_session_data.get("mood")
if found_energy != expected_energy or found_mood != expected_mood:
print(
f"Error: Energy/Mood mismatch for {current_session_date}. Expected: Energy: {expected_energy}/10, Mood: {expected_mood}/10",
file=sys.stderr,
)
return False
else:
print(
f"Error: Invalid Energy/Mood format for {current_session_date}",
file=sys.stderr,
)
return False
elif bullet_text.startswith("Challenges"):
content = bullet_text.split(":", 1)[1].strip()
expected_challenges = current_session_data.get("challenges", [])
if not validate_comma_separated(content, expected_challenges):
print(
f"Error: Challenges mismatch for {current_session_date}. Expected: {expected_challenges}, Found: {content}",
file=sys.stderr,
)
return False
elif bullet_text.startswith("Strategies"):
content = bullet_text.split(":", 1)[1].strip()
expected_strategies = current_session_data.get("strategies", [])
if len(expected_strategies) > 0 and not validate_comma_separated(
content, expected_strategies
):
print(
f"Error: Strategies mismatch for {current_session_date}. Expected: {expected_strategies}, Found: {content}",
file=sys.stderr,
)
return False
elif bullet_text.startswith("Completion"):
# Extract completion percentage
completion_match = re.search(r"Completion:\s*(\d+)%", bullet_text)
if completion_match:
found_completion = int(completion_match.group(1))
expected_completion = int(
current_session_data.get("completion", 0) * 100
)
if found_completion != expected_completion:
print(
f"Error: Completion rate mismatch for {current_session_date}. Expected: {expected_completion}%, Found: {found_completion}%",
file=sys.stderr,
)
return False
else:
print(
f"Error: Invalid completion format for {current_session_date}",
file=sys.stderr,
)
return False
# Verify all sessions have complete bullet points
for date_str, bullets in session_bullet_points.items():
bullets_text = " ".join(bullets)
required_items = [
"Focus factors",
"Energy:",
"Mood:",
"Challenges",
"Strategies",
"Completion",
]
missing_items = []
for item in required_items:
if item not in bullets_text:
missing_items.append(item)
if missing_items:
print(
f"Error: Missing bullet points for session {date_str}: {', '.join(missing_items)}",
file=sys.stderr,
)
return False
# Verify all requirements
if not has_callout:
print(
"Error: Missing callout block with 'Top 2 Most Effective Strategies'.",
file=sys.stderr,
)
return False
if not has_top_strategies and len(top_2_strategies) > 0:
print("Error: Callout doesn't contain strategy information.", file=sys.stderr)
return False
if query_results and session_count == 0:
print("Error: No session sections found with proper headings.", file=sys.stderr)
return False
# Check if all expected sessions are present
missing_sessions = []
for date_str in expected_sessions.keys():
if date_str not in found_sessions:
missing_sessions.append(date_str)
if missing_sessions:
print(
f"Error: Missing session sections for dates: {', '.join(missing_sessions)}",
file=sys.stderr,
)
return False
if query_results and session_count < len(query_results):
print(
f"Warning: Found {session_count} session sections but expected {len(query_results)}.",
file=sys.stderr,
)
print(
"Success: Hyperfocus Analysis Report created with proper structure and content."
)
return True
def main():
"""
Executes the verification process and exits with a status code.
"""
notion = notion_utils.get_notion_client()
main_id = sys.argv[1] if len(sys.argv) > 1 else None
if verify(notion, main_id):
sys.exit(0)
else:
sys.exit(1)
if __name__ == "__main__":
main()
================================================
FILE: tasks/notion/standard/self_assessment/numbered_list_emojis/description.md
================================================
Please find all numbered list items in the Self Assessment page and use Notion tools to replace the numbers with corresponding emoji numbers (e.g., 1️⃣, 2️⃣, 3️⃣). For example:
If the original numbered list is:
1. First step
2. Second step
3. Third step
It should become:
1️⃣ First step
2️⃣ Second step
3️⃣ Third step
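Because Notion renders the numbers of `numbered_list_item` blocks automatically, one plausible approach is to replace each item with a paragraph block whose text starts with the emoji digit. A minimal sketch, assuming the official `notion-client` Python SDK; the token and page id are placeholders and only top-level blocks are handled:
```python
# Sketch only: swap top-level numbered_list_item blocks for emoji-prefixed paragraphs.
from notion_client import Client

EMOJI = {1: "1️⃣", 2: "2️⃣", 3: "3️⃣", 4: "4️⃣", 5: "5️⃣"}

notion = Client(auth="<NOTION_TOKEN>")  # placeholder token
page_id = "<self-assessment-page-id>"  # placeholder id

counter = 0
for block in notion.blocks.children.list(block_id=page_id)["results"]:
    if block["type"] != "numbered_list_item":
        counter = 0  # numbering restarts after any non-list block
        continue
    counter += 1
    text = "".join(rt["plain_text"] for rt in block["numbered_list_item"]["rich_text"])
    notion.blocks.children.append(
        block_id=page_id,
        children=[{
            "type": "paragraph",
            "paragraph": {"rich_text": [{"type": "text", "text": {"content": f"{EMOJI.get(counter, str(counter))} {text}"}}]},
        }],
    )
    notion.blocks.delete(block_id=block["id"])
```
Note that this sketch appends the replacements at the end of the page; keeping each replacement at its original position and converting items nested inside other blocks would need extra handling.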
================================================
FILE: tasks/notion/standard/self_assessment/numbered_list_emojis/meta.json
================================================
{
"task_id": "numbered_list_emojis",
"task_name": "Numbered List Emojis",
"category_id": "self_assessment",
"category_name": "Self Assessment",
"description": "Replace numbered list items with corresponding emoji numbers for better visual formatting.",
"author": "Xiangyan Liu",
"created_at": "2025-07-27",
"difficulty": "L3",
"tags": [
"visual formatting",
"automated migration"
],
"mcp": [
"notion"
],
"meta_data": {
"stateType": "url",
"stateContent": null,
"stateUrl": "https://painted-tennis-ebc.notion.site/Self-Assessment-24381626b6d780fe9f56c2ba14ea042d",
"stateOriginalUrl": "https://painted-tennis-ebc.notion.site/Self-Assessment-24381626b6d780fe9f56c2ba14ea042d"
}
}
================================================
FILE: tasks/notion/standard/self_assessment/numbered_list_emojis/verify.py
================================================
import sys
from notion_client import Client
from tasks.utils import notion_utils
def verify(notion: Client, main_id: str = None) -> bool:
"""
Verifies that numbered lists have been replaced with emoji numbers.
"""
# Start from main_id if provided, otherwise search for the page
self_assessment_page_id = main_id
if main_id:
found_id, object_type = notion_utils.find_page_or_database_by_id(
notion, main_id
)
if found_id and object_type == "page":
self_assessment_page_id = found_id
if not self_assessment_page_id:
# Try to find by name
self_assessment_page_id = notion_utils.find_page(notion, "Self Assessment")
if not self_assessment_page_id:
print("Error: Self Assessment page not found.", file=sys.stderr)
return False
# Get all blocks recursively from the main page
all_blocks = notion_utils.get_all_blocks_recursively(
notion, self_assessment_page_id
)
# Find all numbered_list_item blocks
numbered_list_items = []
for block in all_blocks:
if block.get("type") == "numbered_list_item":
numbered_list_items.append(block)
if len(numbered_list_items) > 0:
print(
f"Error: found {len(numbered_list_items)} numbered list items that should be converted to emoji numbers",
file=sys.stderr,
)
# return False
required_items = [
"1️⃣ Record Each Hyperfocus Session:",
"2️⃣ Review and Reflect:",
"3️⃣ Adjust and Optimize:",
'1️⃣ Harvard Business Review: "The Making of a Corporate Athlete"',
'2️⃣ "Hyperfocus: How to Be More Productive in a World of Distraction" by Chris Bailey',
'3️⃣ "Attention Management: How to Create Success and Gain Productivity Every Day" by Maura Thomas',
'4️⃣ "Deep Work: Rules for Focused Success in a Distracted World" by Cal Newport',
"1️⃣ Record Each Hyperfocus Session:",
"2️⃣ Review and Reflect:",
"3️⃣ Adjust and Optimize:",
"1️⃣ What time of day do you feel most focused?",
"2️⃣ Which environment helps you concentrate the most?",
"3️⃣ What type of tasks do you find yourself getting lost in?",
]
# Make a copy to track which items we've found
remaining_items = required_items.copy()
# Iterate through all blocks to find matching text
for block in all_blocks:
block_text = notion_utils.get_block_plain_text(block).strip()
# Check if this block's text matches any of our required items
if block_text in remaining_items:
remaining_items.remove(block_text)
print(f"Found: {block_text}")
# Check if all required items were found
if len(remaining_items) == 0:
print("Success: All numbered lists have been converted to emoji numbers")
return True
else:
print(f"Error: Missing {len(remaining_items)} required items:", file=sys.stderr)
for item in remaining_items:
print(f" - {item}", file=sys.stderr)
return False
def main():
"""
Executes the verification process and exits with a status code.
"""
notion = notion_utils.get_notion_client()
main_id = sys.argv[1] if len(sys.argv) > 1 else None
if verify(notion, main_id):
sys.exit(0)
else:
sys.exit(1)
if __name__ == "__main__":
main()
================================================
FILE: tasks/notion/standard/standard_operating_procedure/deployment_process_sop/description.md
================================================
Using Notion tools, complete the SOP template (a Notion page titled 'Standard Operating Procedure') by filling in all sections with comprehensive, interconnected content for a "Software Deployment Process" SOP, ensuring all cross-references, terminologies, and procedural steps are properly linked and validated.
**Task Requirements:**
1. **Update the SOP header information** (in the left column):
- Change the heading_1 "SOP Title" text to "Software Deployment Process"
- Update the paragraph "Created 2023-10-25" to "Created 2025-01-19"
- Update the paragraph "Responsible department:" to "Responsible department: DevOps Engineering Team"
- Update the People team page's callout to: "DevOps Engineering Team Wiki - Contains team contact information, escalation procedures, and deployment schedules. Access required for all deployment activities."
2. **Fill the Purpose section** with exactly this content:
- Replace the placeholder paragraph (starts with "↓ Summarize the procedure") with: "This SOP defines the standardized process for deploying software applications to production environments, ensuring zero-downtime deployments, proper rollback procedures, and compliance with security protocols. This procedure applies to all production deployments and must be followed by all engineering teams."
3. **Complete the Context section** with:
- Replace the placeholder paragraph (starts with "↓ Add any related and useful information") with: "Software deployments are critical operations that can impact system availability and user experience. This process has been developed based on industry best practices and our incident response learnings from Q3 2023. All deployments must go through automated testing pipelines and require approval from designated reviewers."
- Update all THREE child_pages under the "Relevant Docs" toggle:
- First child_page callout (Contacting IT): "Change Management Policy (SOP-001) - Defines approval workflows and change review processes for all production modifications."
- Second child_page callout (Team lunches): "Incident Response Procedures (SOP-003) - Emergency procedures for handling deployment failures and system outages."
- Third child_page callout (Sending swag): "Security Compliance Guidelines (SOP-007) - Security requirements and validation steps for production deployments."
4. **Define comprehensive Terminologies** by:
- Replace the placeholder paragraph (starts with "↓ Add any unfamiliar or domain specific words") with: "Essential deployment terminology for team understanding:"
- Replace the existing bulleted_list_item "Term: The definition of the term" with these four exact items:
- "Blue-Green Deployment: A deployment strategy that maintains two identical production environments"
- "Rollback Window: The maximum time allowed to revert a deployment (30 minutes)"
- "Smoke Test: Initial verification tests run immediately after deployment"
- "Production Gateway: The approval checkpoint before production release"
5. **Populate Tools section** with:
- Replace the placeholder paragraph (starts with "↓ Add any relevant tools") with: "Critical tools required for deployment operations:"
- Update the TWO existing child_pages:
- First child_page callout: "Jenkins CI/CD Pipeline - Primary deployment automation tool with integrated testing and approval workflows. Required for all automated deployments."
- Second child_page callout: "Kubernetes Dashboard - Container orchestration monitoring and management interface for deployment verification and rollback operations."
6. **Complete Roles & responsibilities** with:
- Replace the placeholder paragraph (starts with "↓ Define who will be executing") with: "The following roles are essential for successful deployment execution:"
- Replace the existing empty bulleted_list_item with these four exact items:
- "DevOps Engineer: Executes deployment, monitors system health, initiates rollbacks if needed"
- "Lead Developer: Reviews code changes, approves deployment package, validates functionality"
- "QA Engineer: Verifies smoke tests, confirms user acceptance criteria"
- "Security Officer: Validates security compliance, approves security-sensitive deployments"
7. **Create detailed Procedure section** with:
- Replace the placeholder paragraph (starts with "↓ Create a step by step procedure") with: "Follow these steps in sequence. Do not skip steps or perform them out of order."
- Replace the THREE existing numbered_list_items with:
- "Pre-deployment: Verify all automated tests pass, obtain required approvals from Lead Developer and Security Officer, confirm rollback plan is documented and tested"
- "Deployment execution: Deploy to staging environment first, run comprehensive smoke tests, obtain final Production Gateway approval, deploy to production using blue-green strategy"
- "Post-deployment: Monitor system metrics for minimum 30 minutes, validate all functionality using automated tests, document deployment results in change log, notify all stakeholders via deployment notification system"
================================================
FILE: tasks/notion/standard/standard_operating_procedure/deployment_process_sop/meta.json
================================================
{
"task_id": "deployment_process_sop",
"task_name": "Deployment Process SOP",
"category_id": "standard_operating_procedure",
"category_name": "Standard Operating Procedure",
"description": "Complete the SOP template with comprehensive content for a Software Deployment Process with interconnected sections.",
"author": "Xiangyan Liu",
"created_at": "2025-07-27",
"difficulty": "L3",
"tags": [
"template population",
"cross-reference linking",
"content organization",
"visual formatting"
],
"mcp": [
"notion"
],
"meta_data": {
"stateType": "url",
"stateContent": null,
"stateUrl": "https://painted-tennis-ebc.notion.site/Standard-Operating-Procedure-24381626b6d780a8b678f9e62ae5b152",
"stateOriginalUrl": "https://www.notion.so/marketplace/templates/standard-operating-procedure"
}
}
================================================
FILE: tasks/notion/standard/standard_operating_procedure/deployment_process_sop/verify.py
================================================
import sys
from notion_client import Client
from tasks.utils import notion_utils
def verify(notion: Client, main_id: str = None) -> bool:
"""
Verifies comprehensive SOP template completion with exact content matching.
"""
page_id = None
if main_id:
found_id, object_type = notion_utils.find_page_or_database_by_id(
notion, main_id
)
if found_id and object_type == "page":
page_id = found_id
if not page_id:
page_id = notion_utils.find_page(notion, "Standard Operating Procedure")
if not page_id:
print("Error: Page 'Standard Operating Procedure' not found.", file=sys.stderr)
return False
all_blocks = notion_utils.get_all_blocks_recursively(notion, page_id)
verification_results = []
# Check 1: Verify SOP header information updates
sop_title_found = False
created_date_found = False
responsible_dept_found = False
header_callout_found = False
for block in all_blocks:
if block.get("type") == "heading_1":
heading_text = notion_utils.get_block_plain_text(block)
if "Software Deployment Process" in heading_text:
sop_title_found = True
verification_results.append("✅ SOP Title updated correctly")
elif block.get("type") == "paragraph":
para_text = notion_utils.get_block_plain_text(block)
if "Created 2025-01-19" in para_text:
created_date_found = True
verification_results.append("✅ Created date updated correctly")
elif "Responsible department: DevOps Engineering Team" in para_text:
responsible_dept_found = True
verification_results.append(
"✅ Responsible department updated correctly"
)
elif block.get("type") == "child_page":
# Check child pages recursively for callout content - specifically the People team page
try:
child_page_info = notion.pages.retrieve(page_id=block["id"])
child_page_title = ""
if (
"properties" in child_page_info
and "title" in child_page_info["properties"]
):
title_list = child_page_info["properties"]["title"].get("title", [])
if title_list:
child_page_title = title_list[0].get("plain_text", "")
except:
child_page_title = ""
child_blocks = notion_utils.get_all_blocks_recursively(notion, block["id"])
for child_block in child_blocks:
if child_block.get("type") == "callout":
callout_text = notion_utils.get_block_plain_text(child_block)
# Look for the People team page with the DevOps Engineering Team Wiki callout
if (
"DevOps Engineering Team Wiki" in callout_text
and "deployment schedules" in callout_text
and "deployment activities" in callout_text
):
header_callout_found = True
verification_results.append(
"✅ Header People team page callout updated correctly"
)
# Check 2: Verify Purpose section content
purpose_found = False
expected_purpose = "This SOP defines the standardized process for deploying software applications to production environments"
for i, block in enumerate(all_blocks):
if block.get("type") == "heading_2":
heading_text = notion_utils.get_block_plain_text(block)
if "Purpose" in heading_text:
# Check next paragraph after Purpose heading
for j in range(i + 1, min(i + 5, len(all_blocks))):
next_block = all_blocks[j]
if next_block.get("type") == "paragraph":
para_text = notion_utils.get_block_plain_text(next_block)
if (
expected_purpose in para_text
and "engineering teams" in para_text
):
purpose_found = True
verification_results.append(
"✅ Purpose section content updated correctly"
)
break
break
# Check 3: Verify Context section and child_page callouts
context_found = False
child_pages_updated = 0
expected_context = "Software deployments are critical operations that can impact system availability"
expected_child_callouts = [
(
"Change Management Policy (SOP-001)",
"Defines approval workflows and change review processes for all production modifications",
"Contacting IT",
),
(
"Incident Response Procedures (SOP-003)",
"Emergency procedures for handling deployment failures and system outages",
"Team lunches",
),
(
"Security Compliance Guidelines (SOP-007)",
"Security requirements and validation steps for production deployments",
"Sending swag",
),
]
for i, block in enumerate(all_blocks):
if block.get("type") == "heading_2":
heading_text = notion_utils.get_block_plain_text(block)
if "Context" in heading_text:
# Check paragraph content
for j in range(i + 1, min(i + 10, len(all_blocks))):
next_block = all_blocks[j]
if next_block.get("type") == "paragraph":
para_text = notion_utils.get_block_plain_text(next_block)
if expected_context in para_text and "Q3 2023" in para_text:
context_found = True
elif next_block.get("type") == "toggle":
# Check child pages under toggle
toggle_blocks = notion_utils.get_all_blocks_recursively(
notion, next_block["id"]
)
for toggle_child in toggle_blocks:
if toggle_child.get("type") == "child_page":
# Get the child page title to match with expected callouts
try:
child_page_info = notion.pages.retrieve(
page_id=toggle_child["id"]
)
child_page_title = ""
if (
"properties" in child_page_info
and "title" in child_page_info["properties"]
):
title_list = child_page_info["properties"][
"title"
].get("title", [])
if title_list:
child_page_title = title_list[0].get(
"plain_text", ""
)
except:
child_page_title = ""
child_blocks = notion_utils.get_all_blocks_recursively(
notion, toggle_child["id"]
)
for child_block in child_blocks:
if child_block.get("type") == "callout":
callout_text = (
notion_utils.get_block_plain_text(
child_block
)
)
for (
expected_title,
expected_content,
expected_page_title,
) in expected_child_callouts:
if (
expected_title in callout_text
and expected_content in callout_text
and expected_page_title
in child_page_title
):
child_pages_updated += 1
verification_results.append(
f"✅ Context child_page '{expected_page_title}' updated correctly"
)
break
if context_found:
verification_results.append("✅ Context section content updated correctly")
if child_pages_updated == 3:
verification_results.append(
"✅ All 3 Context child_page callouts updated correctly"
)
else:
verification_results.append(
f"❌ Only {child_pages_updated}/3 Context child_page callouts updated correctly (Contacting IT, Team lunches, Sending swag)"
)
# Check 4: Verify Terminologies section with exact 4 bulleted items
terminologies_found = False
terminology_items = []
expected_terminologies = [
"Blue-Green Deployment: A deployment strategy that maintains two identical production environments",
"Rollback Window: The maximum time allowed to revert a deployment (30 minutes)",
"Smoke Test: Initial verification tests run immediately after deployment",
"Production Gateway: The approval checkpoint before production release",
]
for i, block in enumerate(all_blocks):
if block.get("type") == "heading_2":
heading_text = notion_utils.get_block_plain_text(block)
if "Terminologies" in heading_text:
# Check for intro paragraph
for j in range(i + 1, min(i + 2, len(all_blocks))):
if all_blocks[j].get("type") == "paragraph":
para_text = notion_utils.get_block_plain_text(all_blocks[j])
if "Essential deployment terminology" in para_text:
terminologies_found = True
break
# Check bulleted list items
for j in range(i + 1, min(i + 10, len(all_blocks))):
next_block = all_blocks[j]
if next_block.get("type") == "bulleted_list_item":
item_text = notion_utils.get_block_plain_text(next_block)
terminology_items.append(item_text)
elif next_block.get("type") in [
"heading_1",
"heading_2",
"heading_3",
]:
break
break
terminology_matches = sum(
1
for expected in expected_terminologies
if any(expected in item for item in terminology_items)
)
if terminologies_found and len(terminology_items) == 4 and terminology_matches == 4:
verification_results.append(
"✅ Terminologies section with exactly 4 correct items"
)
else:
verification_results.append(
f"❌ Terminologies: expected 4 items, found {len(terminology_items)}, {terminology_matches} correct"
)
# Check 5: Verify Tools section with 2 child_page callouts
tools_found = False
tools_child_pages = 0
expected_tools = [
("Jenkins CI/CD Pipeline", "automated deployments"),
("Kubernetes Dashboard", "rollback operations"),
]
for i, block in enumerate(all_blocks):
if block.get("type") == "heading_2":
heading_text = notion_utils.get_block_plain_text(block)
if "Tools" in heading_text:
# Check intro paragraph
for j in range(i + 1, min(i + 2, len(all_blocks))):
if all_blocks[j].get("type") == "paragraph":
para_text = notion_utils.get_block_plain_text(all_blocks[j])
if "Critical tools required" in para_text:
tools_found = True
break
# Check child pages
for j in range(i + 1, min(i + 10, len(all_blocks))):
next_block = all_blocks[j]
if next_block.get("type") == "child_page":
child_blocks = notion_utils.get_all_blocks_recursively(
notion, next_block["id"]
)
for child_block in child_blocks:
if child_block.get("type") == "callout":
callout_text = notion_utils.get_block_plain_text(
child_block
)
for expected_title, expected_content in expected_tools:
if (
expected_title in callout_text
and expected_content in callout_text
):
tools_child_pages += 1
break
elif next_block.get("type") in [
"heading_1",
"heading_2",
"heading_3",
]:
break
break
if tools_found and tools_child_pages == 2:
verification_results.append(
"✅ Tools section with 2 correctly updated child_page callouts"
)
else:
verification_results.append(
f"❌ Tools section: expected 2 child_pages updated, found {tools_child_pages}"
)
# Check 6: Verify Roles & responsibilities with exactly 4 bulleted items
roles_found = False
role_items = []
expected_roles = [
"DevOps Engineer: Executes deployment, monitors system health, initiates rollbacks if needed",
"Lead Developer: Reviews code changes, approves deployment package, validates functionality",
"QA Engineer: Verifies smoke tests, confirms user acceptance criteria",
"Security Officer: Validates security compliance, approves security-sensitive deployments",
]
for i, block in enumerate(all_blocks):
if block.get("type") == "heading_2":
heading_text = notion_utils.get_block_plain_text(block)
if "Roles" in heading_text and "responsibilities" in heading_text:
# Check intro paragraph
for j in range(i + 1, min(i + 2, len(all_blocks))):
if all_blocks[j].get("type") == "paragraph":
para_text = notion_utils.get_block_plain_text(all_blocks[j])
if "essential for successful deployment execution" in para_text:
roles_found = True
break
# Check bulleted list items
for j in range(i + 1, min(i + 10, len(all_blocks))):
next_block = all_blocks[j]
if next_block.get("type") == "bulleted_list_item":
item_text = notion_utils.get_block_plain_text(next_block)
role_items.append(item_text)
elif next_block.get("type") in [
"heading_1",
"heading_2",
"heading_3",
]:
break
break
role_matches = sum(
1 for expected in expected_roles if any(expected in item for item in role_items)
)
if roles_found and len(role_items) == 4 and role_matches == 4:
verification_results.append(
"✅ Roles & responsibilities section with exactly 4 correct items"
)
else:
verification_results.append(
f"❌ Roles section: expected 4 items, found {len(role_items)}, {role_matches} correct"
)
# Check 7: Verify Procedure section with exactly 3 numbered items
procedure_found = False
procedure_items = []
expected_procedures = [
("Pre-deployment", "Lead Developer and Security Officer", "rollback plan"),
("Deployment execution", "staging environment first", "blue-green strategy"),
(
"Post-deployment",
"minimum 30 minutes",
"stakeholders via deployment notification",
),
]
for i, block in enumerate(all_blocks):
if block.get("type") == "heading_2":
heading_text = notion_utils.get_block_plain_text(block)
if "Procedure" in heading_text:
# Check intro paragraph
for j in range(i + 1, min(i + 2, len(all_blocks))):
if all_blocks[j].get("type") == "paragraph":
para_text = notion_utils.get_block_plain_text(all_blocks[j])
if "Follow these steps in sequence" in para_text:
procedure_found = True
break
# Check numbered list items
for j in range(i + 1, min(i + 10, len(all_blocks))):
next_block = all_blocks[j]
if next_block.get("type") == "numbered_list_item":
item_text = notion_utils.get_block_plain_text(next_block)
procedure_items.append(item_text)
elif next_block.get("type") in [
"heading_1",
"heading_2",
"heading_3",
]:
break
break
procedure_matches = 0
for item_text in procedure_items:
for expected_title, expected_content1, expected_content2 in expected_procedures:
if (
expected_title in item_text
and expected_content1 in item_text
and expected_content2 in item_text
):
procedure_matches += 1
break
if procedure_found and len(procedure_items) == 3 and procedure_matches == 3:
verification_results.append("✅ Procedure section with exactly 3 correct items")
else:
verification_results.append(
f"❌ Procedure: expected 3 items, found {len(procedure_items)}, {procedure_matches} correct"
)
# Calculate overall success
total_checks = 14 # Number of major verification points
successful_checks = sum(
1 for result in verification_results if result.startswith("✅")
)
# Print all verification results
print("\n=== SOP Template Verification Results ===", file=sys.stderr)
for result in verification_results:
print(result, file=sys.stderr)
print(f"\n=== Summary: {successful_checks}/{total_checks} checks passed ===")
# Must pass ALL checks to succeed
success = (
sop_title_found
and created_date_found
and responsible_dept_found
and header_callout_found
and purpose_found
and context_found
and child_pages_updated == 3
and terminologies_found
and len(terminology_items) == 4
and terminology_matches == 4
and tools_found
and tools_child_pages == 2
and roles_found
and len(role_items) == 4
and role_matches == 4
and procedure_found
and len(procedure_items) == 3
and procedure_matches == 3
)
if success:
print("\n🎉 SUCCESS: All SOP template requirements completed correctly!")
return True
else:
print(
f"\n❌ FAILURE: SOP template verification failed. {successful_checks}/{total_checks} requirements met.",
file=sys.stderr,
)
return False
def main():
"""
Executes the verification process and exits with a status code.
"""
notion = notion_utils.get_notion_client()
main_id = sys.argv[1] if len(sys.argv) > 1 else None
if verify(notion, main_id):
sys.exit(0)
else:
sys.exit(1)
if __name__ == "__main__":
main()
================================================
FILE: tasks/notion/standard/standard_operating_procedure/section_organization/description.md
================================================
# Task: Reorganize Standard Operating Procedure Page Sections
## Objective
Modify the structure of the Standard Operating Procedure page in Notion by reorganizing sections through swapping and creating a column layout.
## Requirements
### Step 1: Swap Sections
- Navigate to the Standard Operating Procedure page
- Swap the positions of the "Terminologies" and "Roles & responsibilities" sections
- Preserve all content within each section exactly as is
- Maintain the original formatting and structure of each section
### Step 2: Create Column Layout
- After swapping, arrange the "Tools" section and the section immediately below it ("Terminologies") into a 2-column layout
- Position the "Tools" section in the left column
- Position the "Terminologies" section in the right column
- In the "Tools" column, add links to the Notion and Figma pages using appropriate reference blocks
- Preserve the original child pages from the "Tools" section in a toggle block placed below the column layout, with the toggle titled "original pages"
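A minimal sketch of the 2-column layout in Step 2, assuming the official `notion-client` Python SDK; the page id and the ids of the Notion and Figma pages are placeholders:
```python
# Sketch only: append a column_list holding "Tools" (left) and "Terminologies" (right).
from notion_client import Client

notion = Client(auth="<NOTION_TOKEN>")  # placeholder token

def h2(text):
    return {"type": "heading_2", "heading_2": {"rich_text": [{"type": "text", "text": {"content": text}}]}}

notion.blocks.children.append(
    block_id="<sop-page-id>",  # placeholder id
    children=[{
        "type": "column_list",
        "column_list": {
            "children": [
                {"type": "column", "column": {"children": [
                    h2("Tools"),
                    {"type": "link_to_page", "link_to_page": {"type": "page_id", "page_id": "<notion-page-id>"}},
                    {"type": "link_to_page", "link_to_page": {"type": "page_id", "page_id": "<figma-page-id>"}},
                ]}},
                {"type": "column", "column": {"children": [h2("Terminologies")]}},
            ]
        },
    }],
)
```
Moving the existing section content into the columns and creating the "original pages" toggle are left out of the sketch.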
================================================
FILE: tasks/notion/standard/standard_operating_procedure/section_organization/meta.json
================================================
{
"task_id": "section_organization",
"task_name": "Section Organization",
"category_id": "standard_operating_procedure",
"category_name": "Standard Operating Procedure",
"description": "Reorganize the Standard Operating Procedure page by swapping sections and creating a column layout.",
"author": "Xiangyan Liu",
"created_at": "2025-08-11",
"difficulty": "L3",
"tags": [
"content organization",
"cross-reference linking",
"visual formatting"
],
"mcp": [
"notion"
],
"meta_data": {
"stateType": "url",
"stateContent": null,
"stateUrl": "https://painted-tennis-ebc.notion.site/Standard-Operating-Procedure-24381626b6d780a8b678f9e62ae5b152",
"stateOriginalUrl": "https://www.notion.so/marketplace/templates/standard-operating-procedure"
}
}
================================================
FILE: tasks/notion/standard/standard_operating_procedure/section_organization/verify.py
================================================
import sys
from notion_client import Client
from tasks.utils import notion_utils
def verify(notion: Client, main_id: str = None) -> bool:
"""
Verifies that the Standard Operating Procedure page has been reorganized correctly.
"""
# Step 1: Find the Standard Operating Procedure page
if main_id:
found_id, object_type = notion_utils.find_page_or_database_by_id(notion, main_id)
if not found_id or object_type != 'page':
print("Error: Standard Operating Procedure page not found.", file=sys.stderr)
return False
else:
# Try to find the page by searching
found_id = notion_utils.find_page(notion, "Standard Operating Procedure")
if not found_id:
print("Error: Standard Operating Procedure page not found.", file=sys.stderr)
return False
print(f"Found Standard Operating Procedure page: {found_id}")
# Get all blocks from the page
all_blocks = notion_utils.get_all_blocks_recursively(notion, found_id)
print(f"Found {len(all_blocks)} blocks")
print("Starting verification...")
# Step 2: Verify the structure and section order
print("2. Checking page structure and section order...")
# Expected structure after the initial content and dividers
# We'll look for main sections by their headings
roles_index = None
tools_column_index = None
toggle_index = None
procedure_index = None
for i, block in enumerate(all_blocks):
if block.get("type") == "heading_2":
heading_text = ""
rich_text = block.get("heading_2", {}).get("rich_text", [])
if rich_text:
heading_text = rich_text[0].get("text", {}).get("content", "")
if heading_text == "Roles & responsibilities":
roles_index = i
print(f"✓ Found 'Roles & responsibilities' section at index {i}")
elif heading_text == "Procedure":
procedure_index = i
print(f"✓ Found 'Procedure' section at index {i}")
# Check for column_list (containing Tools and Terminologies)
for i, block in enumerate(all_blocks):
if block.get("type") == "column_list":
# Check if this is the right column_list (should be after Roles & responsibilities)
if roles_index and i > roles_index:
tools_column_index = i
print(f"✓ Found column_list at index {i}")
break
# Check for toggle block with "original pages"
for i, block in enumerate(all_blocks):
if block.get("type") == "toggle":
toggle_text = ""
rich_text = block.get("toggle", {}).get("rich_text", [])
if rich_text:
toggle_text = rich_text[0].get("text", {}).get("content", "")
if toggle_text.lower() == "original pages":
toggle_index = i
print(f"✓ Found 'original pages' toggle at index {i}")
break
# Step 3: Verify section order
print("3. Verifying section order...")
if roles_index is None:
print("Error: 'Roles & responsibilities' section not found.", file=sys.stderr)
return False
if tools_column_index is None:
print("Error: Column layout not found.", file=sys.stderr)
return False
if toggle_index is None:
print("Error: 'original pages' toggle not found.", file=sys.stderr)
return False
if procedure_index is None:
print("Error: 'Procedure' section not found.", file=sys.stderr)
return False
# Verify order: Roles & responsibilities < column_list < toggle < Procedure
if not (roles_index < tools_column_index < toggle_index < procedure_index):
print("Error: Sections are not in the correct order.", file=sys.stderr)
print(f" Expected order: Roles & responsibilities ({roles_index}) < column_list ({tools_column_index}) < toggle ({toggle_index}) < Procedure ({procedure_index})", file=sys.stderr)
return False
print("✓ Sections are in the correct order")
# Step 4: Verify column_list structure
print("4. Verifying column layout structure...")
column_list_block = all_blocks[tools_column_index]
column_list_id = column_list_block.get("id")
# Get direct children of column_list (should be columns only)
try:
column_response = notion.blocks.children.list(block_id=column_list_id)
column_children = column_response.get("results", [])
except Exception as e:
print(f"Error getting column children: {e}", file=sys.stderr)
return False
if len(column_children) < 2:
print(f"Error: Column list should have at least 2 columns, found {len(column_children)}.", file=sys.stderr)
return False
# Verify left column (Tools)
left_column = column_children[0]
if left_column.get("type") != "column":
print("Error: First child of column_list should be a column.", file=sys.stderr)
return False
left_column_id = left_column.get("id")
left_column_blocks = notion_utils.get_all_blocks_recursively(notion, left_column_id)
# Check for Tools heading and link_to_page blocks in left column
tools_heading_found = False
link_to_page_count = 0
for block in left_column_blocks:
if block.get("type") == "heading_2":
heading_text = block.get("heading_2", {}).get("rich_text", [{}])[0].get("text", {}).get("content", "")
if heading_text == "Tools":
tools_heading_found = True
print("✓ Found 'Tools' heading in left column")
elif block.get("type") == "link_to_page":
link_to_page_count += 1
if not tools_heading_found:
print("Error: 'Tools' heading not found in left column.", file=sys.stderr)
return False
# Check for link_to_page blocks in Tools column
if link_to_page_count < 2:
print(f"Error: Tools column should have at least 2 link_to_page blocks, found {link_to_page_count}.", file=sys.stderr)
return False
print(f"✓ Found {link_to_page_count} link_to_page blocks in Tools column")
# Verify right column (Terminologies)
right_column = column_children[1]
if right_column.get("type") != "column":
print("Error: Second child of column_list should be a column.", file=sys.stderr)
return False
right_column_id = right_column.get("id")
right_column_blocks = notion_utils.get_all_blocks_recursively(notion, right_column_id)
# Check for Terminologies heading in right column
terminologies_heading_found = False
for block in right_column_blocks:
if block.get("type") == "heading_2":
heading_text = block.get("heading_2", {}).get("rich_text", [{}])[0].get("text", {}).get("content", "")
if heading_text == "Terminologies":
terminologies_heading_found = True
print("✓ Found 'Terminologies' heading in right column")
break
if not terminologies_heading_found:
print("Error: 'Terminologies' heading not found in right column.", file=sys.stderr)
return False
# Step 5: Verify toggle block content
print("5. Verifying toggle block content...")
toggle_block = all_blocks[toggle_index]
toggle_id = toggle_block.get("id")
# Get direct children of toggle
try:
toggle_response = notion.blocks.children.list(block_id=toggle_id)
toggle_children = toggle_response.get("results", [])
except Exception as e:
print(f"Error getting toggle children: {e}", file=sys.stderr)
return False
# Check for child_page blocks (Notion and Figma)
notion_page_found = False
figma_page_found = False
for block in toggle_children:
if block.get("type") == "child_page":
title = block.get("child_page", {}).get("title", "")
if title == "Notion":
notion_page_found = True
print("✓ Found 'Notion' child page in toggle")
elif title == "Figma":
figma_page_found = True
print("✓ Found 'Figma' child page in toggle")
if not notion_page_found:
print("Error: 'Notion' child page not found in toggle block.", file=sys.stderr)
return False
if not figma_page_found:
print("Error: 'Figma' child page not found in toggle block.", file=sys.stderr)
return False
# Step 6: Verify that original sections no longer exist at top level
print("6. Verifying original sections have been removed from top level...")
# Check that there's no standalone "Terminologies" heading before "Roles & responsibilities"
for i in range(0, roles_index if roles_index else len(all_blocks)):
block = all_blocks[i]
if block.get("type") == "heading_2":
heading_text = block.get("heading_2", {}).get("rich_text", [{}])[0].get("text", {}).get("content", "")
if heading_text == "Terminologies":
print("Error: 'Terminologies' section found before 'Roles & responsibilities'.", file=sys.stderr)
return False
# Check that there's no standalone "Tools" heading outside the column
tools_outside_column = False
for i, block in enumerate(all_blocks):
if i == tools_column_index:
continue # Skip the column_list itself
if block.get("type") == "heading_2":
heading_text = block.get("heading_2", {}).get("rich_text", [{}])[0].get("text", {}).get("content", "")
if heading_text == "Tools" and i != tools_column_index:
# Check if this is NOT inside the column
parent_id = block.get("parent", {}).get("block_id")
if parent_id != left_column_id:
tools_outside_column = True
break
if tools_outside_column:
print("Error: Standalone 'Tools' section found outside column layout.", file=sys.stderr)
return False
print("✓ Original sections have been properly reorganized")
# Step 7: Final summary
print("\n7. Final verification summary:")
print("✓ 'Roles & responsibilities' and 'Terminologies' sections have been swapped")
print("✓ 'Tools' and 'Terminologies' are in a 2-column layout")
print("✓ Links to Notion and Figma pages are in the Tools column")
print("✓ Original child pages are preserved in 'original pages' toggle")
print("✓ Page structure is correct")
print("\n✅ All verification checks passed!")
return True
def main():
"""
Executes the verification process and exits with a status code.
"""
notion = notion_utils.get_notion_client()
main_id = sys.argv[1] if len(sys.argv) > 1 else None
if verify(notion, main_id):
sys.exit(0)
else:
sys.exit(1)
if __name__ == "__main__":
main()
================================================
FILE: tasks/notion/standard/team_projects/priority_tasks_table/description.md
================================================
Hi! In my Team Projects page, please create a five-column table block that lists all tasks meeting either of the following conditions:
1. The progress is 50% or less, or
2. The task has priority P0 but is not yet completed (i.e., progress not at 100%).
You should query this information from the existing “Projects” database.
In the newly created table:
• Each row should represent one task
• All fields should be stored as plain text (not relations, formulas, or linked properties)
• The table should be sorted by expected end date (End Date) in ascending order, so that the first entry is the one with the earliest end date
The table should include the following headers:
• Project
• Eng Hours
• Progress
• Start Date
• End Date
Please make sure all relevant tasks are included. Thank you!
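For illustration, the query and the table block could be built roughly as below, assuming the official `notion-client` Python SDK; the database and page ids are placeholders, and the "Progress" and "Priority" property names are assumptions about the Projects database:
```python
# Sketch only: compound OR filter plus a plain-text table block.
from notion_client import Client

notion = Client(auth="<NOTION_TOKEN>")  # placeholder token

rows = notion.databases.query(
    database_id="<projects-database-id>",  # placeholder id
    filter={
        "or": [
            {"property": "Progress", "number": {"less_than_or_equal_to": 0.5}},
            {"and": [
                {"property": "Priority", "select": {"equals": "P0"}},
                {"property": "Progress", "number": {"less_than": 1}},
            ]},
        ]
    },
)["results"]

def cell(text):
    return [{"type": "text", "text": {"content": text}}]

table_rows = [{"type": "table_row", "table_row": {"cells": [
    cell("Project"), cell("Eng Hours"), cell("Progress"), cell("Start Date"), cell("End Date"),
]}}]
# ...one table_row per result, sorted by End Date ascending, would be appended here...

notion.blocks.children.append(
    block_id="<team-projects-page-id>",  # placeholder id
    children=[{
        "type": "table",
        "table": {"table_width": 5, "has_column_header": True, "has_row_header": False, "children": table_rows},
    }],
)
```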
================================================
FILE: tasks/notion/standard/team_projects/priority_tasks_table/meta.json
================================================
{
"task_id": "priority_tasks_table",
"task_name": "Priority Tasks Table",
"category_id": "team_projects",
"category_name": "Team Projects",
"description": "Create a five-column table listing tasks with 50% or less progress or P0 priority tasks not completed.",
"author": "Zijian Wu",
"created_at": "2025-08-12",
"difficulty": "L3",
"tags": [
"conditional filtering",
"database manipulation",
"data aggregation",
"visual formatting"
],
"mcp": [
"notion"
],
"meta_data": {
"stateType": "url",
"stateContent": null,
"stateUrl": "https://painted-tennis-ebc.notion.site/Team-Projects-24e81626b6d7809c982fdb7a25825898",
"stateOriginalUrl": "https://www.notion.so/marketplace/templates/gantt-chart"
}
}
================================================
FILE: tasks/notion/standard/team_projects/priority_tasks_table/verify.py
================================================
import sys
from datetime import datetime
from notion_client import Client
from tasks.utils import notion_utils
EXPECTED_HEADERS = ["Project", "Eng Hours", "Progress", "Start Date", "End Date"]
EXPECTED_ROWS = [
{
"Project": "Improve response times for support requests",
"Eng Hours": 100,
"Progress": 0.33, # 33%
"Start Date": "2024-10-30",
"End Date": "2024-11-17",
},
{
"Project": "Add a new social media integration",
"Eng Hours": 200,
"Progress": 0.40, # 40%
"Start Date": "2024-11-07",
"End Date": "2024-11-25",
},
{
"Project": "Integrate with a popular third-party service",
"Eng Hours": 250,
"Progress": 0.20, # 20%
"Start Date": "2024-11-10",
"End Date": "2024-11-18",
},
{
"Project": "Create customer knowledge base",
"Eng Hours": 150,
"Progress": 0.40, # 40%
"Start Date": "2024-11-19",
"End Date": "2024-11-25",
},
{
"Project": "Redesign the onboarding process",
"Eng Hours": 300,
"Progress": 0.75, # 75%
"Start Date": "2024-11-20",
"End Date": "2024-12-04",
},
{
"Project": "Publish support knowledge base",
"Eng Hours": None, # N/A
"Progress": 0.0, # 0%
"Start Date": "2024-11-27",
"End Date": "2024-11-29",
},
]
# Sort the expected rows by End Date so we can directly compare order
EXPECTED_ROWS.sort(key=lambda r: r["End Date"])
def _plain_text_from_cell(cell):
"""Concatenate plain_text from a single cell (list of rich_text)."""
return "".join(rt.get("plain_text", "") for rt in cell).strip()
def _parse_progress(value: str):
"""Convert a progress string like '40%', '40.0 %', '0.4' to float in range 0-1."""
value = value.strip()
if not value:
return None
has_percent = "%" in value
# Remove percent sign and any spaces
value = value.replace("%", "").strip()
try:
num = float(value)
if has_percent or num > 1:
num /= 100.0
return num
except ValueError:
return None
def _parse_eng_hours(value: str):
value = value.strip().lower()
if value in {"n/a", "na", "", "—", "-"}:
return None
try:
return float(value)
except ValueError:
return None
def _parse_date(value: str):
value = value.strip()
try:
return datetime.strptime(value, "%Y-%m-%d").date()
except ValueError:
return None
def verify(notion: Client, main_id: str = None) -> bool:
"""Verify that the last table in the 'Team Projects' page matches EXPECTED_ROWS and headers."""
page_id = None
if main_id:
found_id, object_type = notion_utils.find_page_or_database_by_id(notion, main_id)
if found_id and object_type == 'page':
page_id = found_id
if not page_id:
page_id = notion_utils.find_page(notion, "Team Projects")
if not page_id:
print("Error: Page 'Team Projects' not found.", file=sys.stderr)
return False
# Fetch all blocks to locate table blocks
blocks = notion_utils.get_all_blocks_recursively(notion, page_id)
table_blocks = [b for b in blocks if b.get("type") == "table"]
if not table_blocks:
print("Error: No table blocks found in 'Team Projects' page.", file=sys.stderr)
return False
table_block = table_blocks[-1] # Use the last table block
table_id = table_block["id"]
# Retrieve table rows
rows = notion.blocks.children.list(block_id=table_id).get("results", [])
if not rows:
print("Error: Table block has no rows.", file=sys.stderr)
return False
# Validate headers
header_cells = rows[0].get("table_row", {}).get("cells", [])
headers = [_plain_text_from_cell(c) for c in header_cells]
if headers != EXPECTED_HEADERS:
print(f"Error: Table headers mismatch. Found {headers}, expected {EXPECTED_HEADERS}.", file=sys.stderr)
return False
# Parse data rows
data_rows = []
for r in rows[1:]:
cells = r.get("table_row", {}).get("cells", [])
if len(cells) < 5:
continue # Skip malformed rows
project = _plain_text_from_cell(cells[0])
eng_hours_raw = _plain_text_from_cell(cells[1])
progress_raw = _plain_text_from_cell(cells[2])
start_raw = _plain_text_from_cell(cells[3])
end_raw = _plain_text_from_cell(cells[4])
row_dict = {
"Project": project,
"Eng Hours": _parse_eng_hours(eng_hours_raw),
"Progress": _parse_progress(progress_raw),
"Start Date": start_raw.strip(),
"End Date": end_raw.strip(),
}
data_rows.append(row_dict)
if len(data_rows) != len(EXPECTED_ROWS):
print(f"Error: Expected {len(EXPECTED_ROWS)} data rows, found {len(data_rows)}.", file=sys.stderr)
return False
# Verify sorting by End Date ascending
parsed_end_dates = [_parse_date(r["End Date"]) for r in data_rows]
if any(d is None for d in parsed_end_dates):
print("Error: One or more End Date values could not be parsed.", file=sys.stderr)
return False
if parsed_end_dates != sorted(parsed_end_dates):
print("Error: Table is not sorted by End Date ascending.", file=sys.stderr)
return False
# Create mapping from project -> row for comparison
data_map = {r["Project"]: r for r in data_rows}
for expected in EXPECTED_ROWS:
proj = expected["Project"]
if proj not in data_map:
print(f"Error: Project '{proj}' not found in table.", file=sys.stderr)
return False
actual = data_map[proj]
# Compare Eng Hours
expected_hours = expected["Eng Hours"]
actual_hours = actual["Eng Hours"]
if expected_hours is None:
if actual_hours is not None:
print(f"Error: Eng Hours for '{proj}' expected to be empty/N\u204aA but found '{actual_hours}'.", file=sys.stderr)
return False
else:
if actual_hours is None or abs(actual_hours - expected_hours) > 1e-2:
print(f"Error: Eng Hours for '{proj}' mismatch. Expected {expected_hours}, found {actual_hours}.", file=sys.stderr)
return False
# Compare Progress with tolerance
expected_progress = expected["Progress"]
actual_progress = actual["Progress"]
if actual_progress is None or abs(actual_progress - expected_progress) > 1e-2:
print(f"Error: Progress for '{proj}' mismatch. Expected {expected_progress}, found {actual_progress}.", file=sys.stderr)
return False
# Compare Start and End Dates (string equality)
if actual["Start Date"] != expected["Start Date"]:
print(f"Error: Start Date for '{proj}' mismatch. Expected {expected['Start Date']}, found {actual['Start Date']}.", file=sys.stderr)
return False
if actual["End Date"] != expected["End Date"]:
print(f"Error: End Date for '{proj}' mismatch. Expected {expected['End Date']}, found {actual['End Date']}.", file=sys.stderr)
return False
print("Success: Verified table block contents and order successfully.")
return True
def main():
"""Execute verification and exit with status code."""
notion = notion_utils.get_notion_client()
main_id = sys.argv[1] if len(sys.argv) > 1 else None
if verify(notion, main_id):
sys.exit(0)
else:
sys.exit(1)
if __name__ == "__main__":
main()
================================================
FILE: tasks/notion/standard/team_projects/swap_tasks/description.md
================================================
Go to the Team Projects page, find the person responsible for the most tasks and the person responsible for the fewest tasks, then swap their assigned tasks.
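A minimal sketch of the counting-and-swap logic, assuming the official `notion-client` Python SDK and that assignees live in an "Assigned" people property of the Tasks database; the token and database id are placeholders:
```python
# Sketch only: count tasks per assignee, then swap the two extremes.
from collections import Counter
from notion_client import Client

notion = Client(auth="<NOTION_TOKEN>")  # placeholder token
tasks = notion.databases.query(database_id="<tasks-database-id>", page_size=100)["results"]

counts = Counter()
for task in tasks:
    people = task["properties"]["Assigned"]["people"]
    if people:
        counts[people[0]["id"]] += 1

most_id, _ = counts.most_common(1)[0]
fewest_id, _ = counts.most_common()[-1]

for task in tasks:
    people = task["properties"]["Assigned"]["people"]
    if not people:
        continue
    if people[0]["id"] == most_id:
        notion.pages.update(page_id=task["id"], properties={"Assigned": {"people": [{"id": fewest_id}]}})
    elif people[0]["id"] == fewest_id:
        notion.pages.update(page_id=task["id"], properties={"Assigned": {"people": [{"id": most_id}]}})
```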
================================================
FILE: tasks/notion/standard/team_projects/swap_tasks/meta.json
================================================
{
"task_id": "swap_tasks",
"task_name": "Swap Tasks",
"category_id": "team_projects",
"category_name": "Team Projects",
"description": "Find the person responsible for the most and fewest tasks, then swap their assigned tasks.",
"author": "Xiangyan Liu",
"created_at": "2025-08-12",
"difficulty": "L3",
"tags": [
"data aggregation",
"automated migration",
"conditional filtering"
],
"mcp": [
"notion"
],
"meta_data": {
"stateType": "url",
"stateContent": null,
"stateUrl": "https://painted-tennis-ebc.notion.site/Team-Projects-24e81626b6d7809c982fdb7a25825898",
"stateOriginalUrl": "https://www.notion.so/marketplace/templates/gantt-chart"
}
}
================================================
FILE: tasks/notion/standard/team_projects/swap_tasks/verify.py
================================================
import sys
from notion_client import Client
from tasks.utils import notion_utils
def verify(notion: Client, main_id: str = None) -> bool:
"""
Verifies that the task assignees have been swapped correctly.
Checks:
1. "Develop a plan for promotion" and "Evaluate different third-party services" have swapped assignees
2. The person with most tasks and person with least tasks have swapped all their tasks
"""
# Step 1: Find the Team Projects page
if main_id:
found_id, object_type = notion_utils.find_page_or_database_by_id(notion, main_id)
if not found_id or object_type != 'page':
print("Error: Team Projects page not found.", file=sys.stderr)
return False
else:
# Try to find the page by searching
found_id = notion_utils.find_page(notion, "Team Projects")
if not found_id:
print("Error: Team Projects page not found.", file=sys.stderr)
return False
# Get all blocks from the page to find database references
all_blocks = notion_utils.get_all_blocks_recursively(notion, found_id)
# Find Tasks database ID from the page
tasks_db_id = None
for block in all_blocks:
if block and block.get("type") == "child_database":
db_title = block.get("child_database", {}).get("title", "")
if "Tasks" in db_title:
tasks_db_id = block["id"]
break
if not tasks_db_id:
print("Error: Tasks database not found.", file=sys.stderr)
return False
print("\n📋 Starting verification...")
# Step 2: Query all tasks to analyze assignees
try:
all_tasks_response = notion.databases.query(
database_id=tasks_db_id,
page_size=100
)
if not all_tasks_response.get("results"):
print("Error: No tasks found in Tasks database.", file=sys.stderr)
return False
tasks = all_tasks_response["results"]
except Exception as e:
print(f"Error querying Tasks database: {e}", file=sys.stderr)
return False
# Step 3: Check specific tasks have swapped assignees
develop_plan_task = None
evaluate_services_task = None
for task in tasks:
task_name = task["properties"]["Name"]["title"][0]["text"]["content"]
if task_name == "Develop a plan for promotion":
develop_plan_task = task
elif task_name == "Evaluate different third-party services":
evaluate_services_task = task
if not develop_plan_task or not evaluate_services_task:
print("Error: Could not find both required tasks.", file=sys.stderr)
return False
# Get assignees for these tasks
develop_plan_assignees = develop_plan_task["properties"]["Assigned"]["people"]
evaluate_services_assignees = evaluate_services_task["properties"]["Assigned"]["people"]
if not develop_plan_assignees or not evaluate_services_assignees:
print("Error: Tasks don't have assignees.", file=sys.stderr)
return False
develop_plan_assignee_id = develop_plan_assignees[0]["id"]
evaluate_services_assignee_id = evaluate_services_assignees[0]["id"]
# These should be different (swapped)
if develop_plan_assignee_id == evaluate_services_assignee_id:
print("Error: Tasks should have different assignees after swap.", file=sys.stderr)
return False
# Step 4: Count tasks per person
task_counts = {}
unassigned_count = 0
for task in tasks:
assignees = task["properties"]["Assigned"]["people"]
if assignees:
assignee_id = assignees[0]["id"]
if assignee_id not in task_counts:
task_counts[assignee_id] = []
task_counts[assignee_id].append(task["properties"]["Name"]["title"][0]["text"]["content"])
else:
unassigned_count += 1
# Sort by task count
sorted_assignees = sorted(task_counts.items(), key=lambda x: len(x[1]))
if len(sorted_assignees) < 2:
print("Error: Need at least 2 people with tasks to verify swap.", file=sys.stderr)
return False
# Get person with least and most tasks
person_with_least = sorted_assignees[0]
person_with_most = sorted_assignees[-1]
least_id, least_tasks = person_with_least
most_id, most_tasks = person_with_most
# Step 5: Verify the swap pattern
# Original distribution (before swap):
# - 5ac96c02-49a4-4320-8de6-b663ba83126b had 3 tasks (least)
# - ac7a3bd0-c111-4464-8f45-8a857a1abc8a had 10 tasks (most)
# After complete swap, we expect:
# - 5ac96c02-49a4-4320-8de6-b663ba83126b should have 10 tasks
# - ac7a3bd0-c111-4464-8f45-8a857a1abc8a should have 3 tasks
original_least_id = "5ac96c02-49a4-4320-8de6-b663ba83126b"
original_most_id = "ac7a3bd0-c111-4464-8f45-8a857a1abc8a"
# Check if the swap has been completed
swap_completed = False
for assignee_id, assignee_tasks in task_counts.items():
if assignee_id == original_least_id and len(assignee_tasks) == 10:
# Person who had 3 now has 10
for other_id, other_tasks in task_counts.items():
if other_id == original_most_id and len(other_tasks) == 3:
# Person who had 10 now has 3
swap_completed = True
break
# Step 6: Summary
print(f"\n📊 Task Distribution:")
print(f" • Total tasks: {len(tasks)}")
print(f" • Assigned tasks: {len(tasks) - unassigned_count}")
print(f" • Unassigned tasks: {unassigned_count}")
print(f" • People with tasks: {len(task_counts)}")
print(f"\n Task counts by person:")
for assignee_id, assignee_tasks in sorted_assignees:
print(f" - {assignee_id[:8]}...: {len(assignee_tasks)} tasks")
# Step 7: Final verification
print("\n🔍 Verification Results:")
# Check that the swap has created a significant difference
if len(most_tasks) - len(least_tasks) < 5:
print(f"Warning: Difference between most and least is only {len(most_tasks) - len(least_tasks)} tasks", file=sys.stderr)
# Verify specific expected outcomes
verification_passed = True
# Check 1: Specific tasks have been swapped
specific_tasks_swapped = develop_plan_assignee_id != evaluate_services_assignee_id
if specific_tasks_swapped:
print(" ✓ Specific tasks have been swapped")
else:
print(" ✗ Specific tasks were not swapped", file=sys.stderr)
verification_passed = False
# Check 2: Task distribution shows a complete swap
if swap_completed:
print(" ✓ Complete task swap verified (3↔10 tasks)")
else:
# Show actual distribution for debugging
person1_tasks = len(task_counts.get(original_least_id, []))
person2_tasks = len(task_counts.get(original_most_id, []))
print(f" ✗ Swap incomplete! Expected 5ac96c02→10 tasks, ac7a3bd0→3 tasks", file=sys.stderr)
print(f" Actual: 5ac96c02→{person1_tasks} tasks, ac7a3bd0→{person2_tasks} tasks", file=sys.stderr)
verification_passed = False
# Check 3: Total task count is preserved
total_assigned_tasks = sum(len(tasks) for _, tasks in task_counts.items())
expected_total = len(tasks) - unassigned_count
if total_assigned_tasks == expected_total:
print(f" ✓ Total task count preserved ({total_assigned_tasks} assigned)")
else:
print(f" ✗ Task count mismatch: {total_assigned_tasks} vs {expected_total} expected", file=sys.stderr)
verification_passed = False
if verification_passed:
print("\n✅ All verification checks passed!")
return True
else:
print("\n❌ Verification failed", file=sys.stderr)
return False
def main():
"""
Executes the verification process and exits with a status code.
"""
notion = notion_utils.get_notion_client()
main_id = sys.argv[1] if len(sys.argv) > 1 else None
if verify(notion, main_id):
sys.exit(0)
else:
sys.exit(1)
if __name__ == "__main__":
main()
================================================
FILE: tasks/notion/standard/toronto_guide/change_color/description.md
================================================
Navigate to the Toronto Guide page in Notion and change all pink-colored elements (tags and callout colors) to different colors.
## Requirements
1. Find and access the Toronto Guide page in Notion
2. Identify and change all pink elements including:
- Pink tags in databases
- Pink callout backgrounds
3. Change all pink colors to any other color of your choice
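As a rough illustration (an assumption-laden sketch, not part of the requirements), recoloring a pink database tag and a pink callout with the official `notion-client` SDK could look like this; the token and IDs are placeholders:
```python
# Illustrative sketch only: recolor a pink multi_select tag and a pink callout.
from notion_client import Client

notion = Client(auth="<NOTION_TOKEN>")  # placeholder token

# 1) Database tag: read the current options, change only the pink ones' color,
#    and write the full option list back (names and IDs stay the same).
db = notion.databases.retrieve(database_id="<ACTIVITIES_DB_ID>")
options = db["properties"]["Tags"]["multi_select"]["options"]
for option in options:
    if option.get("color") == "pink":
        option["color"] = "blue"  # any non-pink color
notion.databases.update(
    database_id="<ACTIVITIES_DB_ID>",
    properties={"Tags": {"multi_select": {"options": options}}},
)

# 2) Callout block: update just its color to a non-pink background.
notion.blocks.update(block_id="<CALLOUT_BLOCK_ID>", callout={"color": "blue_background"})
```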
================================================
FILE: tasks/notion/standard/toronto_guide/change_color/meta.json
================================================
{
"task_id": "change_color",
"task_name": "Change Color",
"category_id": "toronto_guide",
"category_name": "Toronto Guide",
"description": "Navigate to the Toronto Guide page and change all pink-colored elements to different colors.",
"author": "Xiangyan Liu",
"created_at": "2025-08-14",
"difficulty": "L3",
"tags": [
"visual formatting",
"conditional filtering"
],
"mcp": [
"notion"
],
"meta_data": {
"stateType": "url",
"stateContent": null,
"stateUrl": "https://painted-tennis-ebc.notion.site/Toronto-Guide-25281626b6d7802caa7cc394647e901c",
"stateOriginalUrl": "https://www.notion.so/marketplace/templates/conquering-toronto-a-destination-guide"
}
}
================================================
FILE: tasks/notion/standard/toronto_guide/change_color/verify.py
================================================
import sys
from notion_client import Client
from tasks.utils import notion_utils
def get_page_title(page_result):
"""Extract title from a page result"""
properties = page_result.get('properties', {})
for prop_name in ['Name', 'Title', 'title']:
if prop_name in properties:
prop = properties[prop_name]
if prop.get('type') == 'title':
title_array = prop.get('title', [])
if title_array and len(title_array) > 0:
return title_array[0].get('plain_text', '')
return ''
def get_page_tags(page_result):
"""Extract tags from a page result"""
properties = page_result.get('properties', {})
tags_property = properties.get('Tags', {})
if tags_property.get('type') == 'multi_select':
tags = tags_property.get('multi_select', [])
return [tag.get('name') for tag in tags]
return []
def verify(notion: Client, main_id: str = None) -> bool:
"""
Verifies that all pink colors have been changed in the Toronto Guide page.
Expected pink elements that should be changed:
1. Callout: "Welcome to Toronto!" with red_background (originally should be pink)
2. Activities database tags:
- "Parks" tag (High Park, Evergreen Brickworks)
- "Neighbourhood" tag (Ossington Strip, Chinatown, Little Italy, Kensington Market, Queen west, The beaches)
3. Food database tags:
- "Middle Eastern" (Byblos Downtown)
- "Jamaican" (Crumbs Patties)
- "Indian" (Leela Indian Food Bar)
4. Cafes database tag:
- "Food" (Cafe Landwer)
These elements should exist with the same name/content but different colors.
Tag distributions should remain the same.
"""
# Step 1: Find the main Toronto Guide page
if main_id:
found_id, object_type = notion_utils.find_page_or_database_by_id(notion, main_id)
if not found_id or object_type != 'page':
print("Error: Toronto Guide page not found.", file=sys.stderr)
return False
else:
# Try to find the page by searching
found_id = notion_utils.find_page(notion, "Toronto Guide")
if not found_id:
print("Error: Toronto Guide page not found.", file=sys.stderr)
return False
print(f"Found Toronto Guide page: {found_id}")
# Get all blocks from the page
all_blocks = notion_utils.get_all_blocks_recursively(notion, found_id)
print(f"Found {len(all_blocks)} blocks")
# Expected elements and their distributions
expected_pink_elements = {
"callout": {
"text": "Welcome to Toronto!",
"found": False,
"has_pink": False,
"exists": False
},
"activities_tags": {
"Parks": {
"found": False,
"has_pink": False,
"expected_items": ["High Park", "Evergreen Brickworks"],
"actual_items": []
},
"Neighbourhood": {
"found": False,
"has_pink": False,
"expected_items": ["Ossington Strip", "Chinatown", "Little Italy", "Kensington Market", "Queen west", "The beaches"],
"actual_items": []
}
},
"food_tags": {
"Middle Eastern": {
"found": False,
"has_pink": False,
"expected_items": ["Byblos Downtown"],
"actual_items": []
},
"Jamaican": {
"found": False,
"has_pink": False,
"expected_items": ["Crumbs Patties"],
"actual_items": []
},
"Indian": {
"found": False,
"has_pink": False,
"expected_items": ["Leela Indian Food Bar"],
"actual_items": []
}
},
"cafes_tags": {
"Food": {
"found": False,
"has_pink": False,
"expected_items": ["Cafe Landwer"],
"actual_items": []
}
}
}
# Database IDs
activities_db_id = None
food_db_id = None
cafes_db_id = None
# Step 2: Check all blocks for callouts and find databases
for block in all_blocks:
if block is None:
continue
block_type = block.get("type")
# Check for the specific callout block
if block_type == "callout":
callout_text = notion_utils.get_block_plain_text(block)
if "Welcome to Toronto!" in callout_text:
expected_pink_elements["callout"]["exists"] = True
expected_pink_elements["callout"]["found"] = True
color = block.get("callout", {}).get("color", "")
if "pink" in color.lower():
expected_pink_elements["callout"]["has_pink"] = True
print(f"✗ Callout 'Welcome to Toronto!' still has pink color: {color}")
else:
print(f"✓ Callout 'Welcome to Toronto!' has non-pink color: {color}")
# Find child databases
elif block_type == "child_database":
title = block.get("child_database", {}).get("title", "")
block_id = block.get("id")
if "Activities" in title:
activities_db_id = block_id
print(f"Found Activities database: {block_id}")
elif "Food" in title:
food_db_id = block_id
print(f"Found Food database: {block_id}")
elif "Cafes" in title or "Café" in title:
cafes_db_id = block_id
print(f"Found Cafes database: {block_id}")
# Step 3: Check Activities database for specific tags and their distributions
if activities_db_id:
try:
# Get database properties
db_info = notion.databases.retrieve(database_id=activities_db_id)
tags_property = db_info.get("properties", {}).get("Tags", {})
if tags_property.get("type") == "multi_select":
options = tags_property.get("multi_select", {}).get("options", [])
for option in options:
tag_name = option.get("name").strip()
tag_color = option.get("color")
if tag_name in expected_pink_elements["activities_tags"]:
expected_pink_elements["activities_tags"][tag_name]["found"] = True
if tag_color == "pink":
expected_pink_elements["activities_tags"][tag_name]["has_pink"] = True
print(f"✗ Activities tag '{tag_name}' still has pink color")
else:
print(f"✓ Activities tag '{tag_name}' changed to {tag_color}")
# Query database to check tag distributions
query_result = notion.databases.query(database_id=activities_db_id)
for page in query_result.get('results', []):
page_title = get_page_title(page).strip()
page_tags = get_page_tags(page)
for tag_name in expected_pink_elements["activities_tags"]:
if tag_name in page_tags:
expected_pink_elements["activities_tags"][tag_name]["actual_items"].append(page_title)
except Exception as e:
print(f"Error checking Activities database: {e}", file=sys.stderr)
return False
else:
print("Error: Activities database not found", file=sys.stderr)
return False
# Step 4: Check Food database for specific tags and their distributions
if food_db_id:
try:
# Get database properties
db_info = notion.databases.retrieve(database_id=food_db_id)
tags_property = db_info.get("properties", {}).get("Tags", {})
if tags_property.get("type") == "multi_select":
options = tags_property.get("multi_select", {}).get("options", [])
for option in options:
tag_name = option.get("name").strip()
tag_color = option.get("color")
if tag_name in expected_pink_elements["food_tags"]:
expected_pink_elements["food_tags"][tag_name]["found"] = True
if tag_color == "pink":
expected_pink_elements["food_tags"][tag_name]["has_pink"] = True
print(f"✗ Food tag '{tag_name}' still has pink color")
else:
print(f"✓ Food tag '{tag_name}' changed to {tag_color}")
# Query database to check tag distributions
query_result = notion.databases.query(database_id=food_db_id)
for page in query_result.get('results', []):
page_title = get_page_title(page).strip()
page_tags = get_page_tags(page)
for tag_name in expected_pink_elements["food_tags"]:
if tag_name in page_tags:
expected_pink_elements["food_tags"][tag_name]["actual_items"].append(page_title)
except Exception as e:
print(f"Error checking Food database: {e}", file=sys.stderr)
return False
else:
print("Error: Food database not found", file=sys.stderr)
return False
# Step 5: Check Cafes database for specific tags and their distributions
if cafes_db_id:
try:
# Get database properties
db_info = notion.databases.retrieve(database_id=cafes_db_id)
tags_property = db_info.get("properties", {}).get("Tags", {})
if tags_property.get("type") == "multi_select":
options = tags_property.get("multi_select", {}).get("options", [])
for option in options:
tag_name = option.get("name").strip()
tag_color = option.get("color")
if tag_name in expected_pink_elements["cafes_tags"]:
expected_pink_elements["cafes_tags"][tag_name]["found"] = True
if tag_color == "pink":
expected_pink_elements["cafes_tags"][tag_name]["has_pink"] = True
print(f"✗ Cafes tag '{tag_name}' still has pink color")
else:
print(f"✓ Cafes tag '{tag_name}' changed to {tag_color}")
# Query database to check tag distributions
query_result = notion.databases.query(database_id=cafes_db_id)
for page in query_result.get('results', []):
page_title = get_page_title(page).strip()
page_tags = get_page_tags(page)
for tag_name in expected_pink_elements["cafes_tags"]:
if tag_name in page_tags:
expected_pink_elements["cafes_tags"][tag_name]["actual_items"].append(page_title)
except Exception as e:
print(f"Error checking Cafes database: {e}", file=sys.stderr)
return False
else:
print("Error: Cafes database not found", file=sys.stderr)
return False
# Step 6: Verify all requirements
print(f"\nVerification Summary:")
all_passed = True
# Check callout
if not expected_pink_elements["callout"]["exists"]:
print("✗ 'Welcome to Toronto!' callout not found", file=sys.stderr)
all_passed = False
elif expected_pink_elements["callout"]["has_pink"]:
print("✗ Callout still has pink background", file=sys.stderr)
all_passed = False
else:
print("✓ Callout color changed from pink")
# Check Activities tags
print("\nActivities Database Tags:")
for tag_name, tag_info in expected_pink_elements["activities_tags"].items():
if not tag_info["found"]:
print(f"✗ Activities tag '{tag_name}' not found (may have been renamed)", file=sys.stderr)
# Don't fail if tag was renamed, as that's acceptable
elif tag_info["has_pink"]:
print(f"✗ Activities tag '{tag_name}' still has pink color", file=sys.stderr)
all_passed = False
else:
print(f"✓ Activities tag '{tag_name}' color changed from pink")
# Check distribution
expected_set = set(tag_info["expected_items"])
actual_set = set(tag_info["actual_items"])
if tag_info["found"] and expected_set != actual_set:
print(f" ✗ Tag distribution mismatch for '{tag_name}':", file=sys.stderr)
print(f" Expected: {sorted(expected_set)}", file=sys.stderr)
print(f" Actual: {sorted(actual_set)}", file=sys.stderr)
# Note: We don't fail on distribution mismatch if tag was renamed
if not (expected_set - actual_set): # If all expected items are present
print(f" (Additional items found, but all expected items are present)")
elif tag_info["found"]:
print(f" ✓ Tag distribution maintained for '{tag_name}'")
# Check Food tags
print("\nFood Database Tags:")
for tag_name, tag_info in expected_pink_elements["food_tags"].items():
if not tag_info["found"]:
print(f"✗ Food tag '{tag_name}' not found (may have been renamed)", file=sys.stderr)
# Don't fail if tag was renamed, as that's acceptable
elif tag_info["has_pink"]:
print(f"✗ Food tag '{tag_name}' still has pink color", file=sys.stderr)
all_passed = False
else:
print(f"✓ Food tag '{tag_name}' color changed from pink")
# Check distribution
expected_set = set(tag_info["expected_items"])
actual_set = set(tag_info["actual_items"])
if tag_info["found"] and expected_set != actual_set:
print(f" ✗ Tag distribution mismatch for '{tag_name}':", file=sys.stderr)
print(f" Expected: {sorted(expected_set)}", file=sys.stderr)
print(f" Actual: {sorted(actual_set)}", file=sys.stderr)
elif tag_info["found"]:
print(f" ✓ Tag distribution maintained for '{tag_name}'")
# Check Cafes tags
print("\nCafes Database Tags:")
for tag_name, tag_info in expected_pink_elements["cafes_tags"].items():
if not tag_info["found"]:
print(f"✗ Cafes tag '{tag_name}' not found (may have been renamed)", file=sys.stderr)
# Don't fail if tag was renamed, as that's acceptable
elif tag_info["has_pink"]:
print(f"✗ Cafes tag '{tag_name}' still has pink color", file=sys.stderr)
all_passed = False
else:
print(f"✓ Cafes tag '{tag_name}' color changed from pink")
# Check distribution
expected_set = set(tag_info["expected_items"])
actual_set = set(tag_info["actual_items"])
if tag_info["found"] and expected_set != actual_set:
print(f" ✗ Tag distribution mismatch for '{tag_name}':", file=sys.stderr)
print(f" Expected: {sorted(expected_set)}", file=sys.stderr)
print(f" Actual: {sorted(actual_set)}", file=sys.stderr)
elif tag_info["found"]:
print(f" ✓ Tag distribution maintained for '{tag_name}'")
# Additional check: ensure no other pink elements exist
print("\nChecking for any other pink elements...")
other_pink_found = False
# Check all callouts for pink
for block in all_blocks:
if block and block.get("type") == "callout":
color = block.get("callout", {}).get("color", "")
if "pink" in color.lower():
callout_text = notion_utils.get_block_plain_text(block)[:50]
if "Welcome to Toronto!" not in callout_text:
print(f"✗ Found unexpected pink callout: {callout_text}...", file=sys.stderr)
other_pink_found = True
if other_pink_found:
all_passed = False
else:
print("✓ No unexpected pink elements found")
return all_passed
def main():
"""
Executes the verification process and exits with a status code.
"""
notion = notion_utils.get_notion_client()
main_id = sys.argv[1] if len(sys.argv) > 1 else None
if verify(notion, main_id):
print("\nVerification passed: All expected pink colors have been changed")
sys.exit(0)
else:
print("\nVerification failed: Some pink colors still exist or elements are missing")
sys.exit(1)
if __name__ == "__main__":
main()
================================================
FILE: tasks/notion/standard/toronto_guide/weekend_adventure_planner/description.md
================================================
Create a comprehensive weekend adventure planner that analyzes the Toronto Guide databases and generates a structured itinerary page. I need you to create a new page called 'Perfect Weekend Adventure' as a child of the main Toronto Guide page.
**Task Requirements:**
1. Create a new page titled 'Perfect Weekend Adventure' as a child page of the main Toronto Guide page
2. Query the Activities database to identify all activities that have the "Beaches" tag
3. Query the Food database to find all restaurants with "Turkish" or "Hakka" tags
4. Query the Cafes database to retrieve all cafes entries
5. Structure the page with the following specific format:
- Add a heading_1 block with text "🎒 Perfect Weekend Adventure"
- Add a heading_2 block with text "🏖️ Beach Activities"
- Under Beach Activities, create a bulleted list with all activities that have the "Beaches" tag, showing: Name - Google Maps Link (if available)
- Add a heading_2 block with text "🍽️ Cultural Dining Experience"
- Under Cultural Dining, create a numbered list of all restaurants with "Turkish" or "Hakka" tags, formatted as: Restaurant Name (Tag: [actual tag name])
- Add a heading_2 block with text "☕ Coffee Break Spots"
- Under Coffee Break Spots, create a toggle block titled "Top Cafes to Visit" containing all cafe entries as to-do items (unchecked), each showing just the cafe name
- Add a heading_2 block with text "📊 Weekend Summary"
- Under Weekend Summary, add a paragraph with the exact text: "This weekend includes [X] beach activities, [Y] cultural dining options, and [Z] coffee spots to explore!" where [X], [Y], and [Z] are the actual counts
6. After the summary paragraph, add a divider block
7. Finally, add a callout block with the 💡 emoji containing the text: "Pro tip: Check the Seasons database for the best time to enjoy outdoor activities!"
8. Ensure all headings use the exact emoji and text format specified above
9. The lists must be in the exact format specified (bulleted for beaches, numbered for restaurants, to-do for cafes)
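For orientation only, a minimal sketch (assuming the official `notion-client` SDK; IDs, item texts, and counts are placeholders) of the child page and a few of the block shapes described above:
```python
# Illustrative sketch only -- not the reference solution for this task.
from notion_client import Client

notion = Client(auth="<NOTION_TOKEN>")  # placeholder token

def rich(text):
    """Build a rich_text array from a plain string."""
    return [{"type": "text", "text": {"content": text}}]

# Requirement 1: child page under the main Toronto Guide page.
page = notion.pages.create(
    parent={"page_id": "<TORONTO_GUIDE_PAGE_ID>"},
    properties={"title": [{"type": "text", "text": {"content": "Perfect Weekend Adventure"}}]},
)

# Requirements 5-7: a sample of the required blocks (headings, one bullet,
# the cafes toggle with an unchecked to-do, the divider, and the callout).
notion.blocks.children.append(
    block_id=page["id"],
    children=[
        {"object": "block", "type": "heading_1",
         "heading_1": {"rich_text": rich("🎒 Perfect Weekend Adventure")}},
        {"object": "block", "type": "heading_2",
         "heading_2": {"rich_text": rich("🏖️ Beach Activities")}},
        {"object": "block", "type": "bulleted_list_item",
         "bulleted_list_item": {"rich_text": rich("<Activity Name> - <Google Maps Link>")}},
        {"object": "block", "type": "heading_2",
         "heading_2": {"rich_text": rich("☕ Coffee Break Spots")}},
        {"object": "block", "type": "toggle",
         "toggle": {"rich_text": rich("Top Cafes to Visit"),
                    "children": [{"object": "block", "type": "to_do",
                                  "to_do": {"rich_text": rich("<Cafe Name>"), "checked": False}}]}},
        {"object": "block", "type": "divider", "divider": {}},
        {"object": "block", "type": "callout",
         "callout": {"rich_text": rich("Pro tip: Check the Seasons database for the best time to enjoy outdoor activities!"),
                     "icon": {"type": "emoji", "emoji": "💡"}}},
    ],
)
```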
================================================
FILE: tasks/notion/standard/toronto_guide/weekend_adventure_planner/meta.json
================================================
{
"task_id": "weekend_adventure_planner",
"task_name": "Weekend Adventure Planner",
"category_id": "toronto_guide",
"category_name": "Toronto Guide",
"description": "Create a comprehensive weekend adventure planner that analyzes Toronto Guide databases and generates a structured itinerary page.",
"author": "Xiangyan Liu",
"created_at": "2025-08-14",
"difficulty": "L3",
"tags": [
"conditional filtering",
"data aggregation",
"report generation",
"visual formatting",
"status tracking"
],
"mcp": [
"notion"
],
"meta_data": {
"stateType": "url",
"stateContent": null,
"stateUrl": "https://painted-tennis-ebc.notion.site/Toronto-Guide-25281626b6d7802caa7cc394647e901c",
"stateOriginalUrl": "https://www.notion.so/marketplace/templates/conquering-toronto-a-destination-guide"
}
}
================================================
FILE: tasks/notion/standard/toronto_guide/weekend_adventure_planner/verify.py
================================================
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
import sys
from notion_client import Client
from tasks.utils import notion_utils
def verify(notion: Client, main_id: str = None) -> bool:
"""
Verifies that the Perfect Weekend Adventure page has been created correctly.
"""
# Find the main Toronto Guide page
page_id = None
if main_id:
found_id, object_type = notion_utils.find_page_or_database_by_id(notion, main_id)
if found_id and object_type == "page":
page_id = found_id
if not page_id:
page_id = notion_utils.find_page(notion, "Toronto Guide")
if not page_id:
print("Error: Main 'Toronto Guide' page not found.", file=sys.stderr)
return False
# Find the Perfect Weekend Adventure child page
adventure_page_id = None
try:
response = notion.search(
query="Perfect Weekend Adventure",
filter={"property": "object", "value": "page"}
)
for result in response.get("results", []):
parent = result.get("parent", {})
if parent.get("type") == "page_id" and parent.get("page_id") == page_id:
adventure_page_id = result["id"]
break
if not adventure_page_id:
for result in response.get("results", []):
title_list = result.get("properties", {}).get("title", {}).get("title", [])
for title_obj in title_list:
if "Perfect Weekend Adventure" in title_obj.get("plain_text", ""):
adventure_page_id = result["id"]
break
if adventure_page_id:
break
except Exception as e:
print(f"Error searching for Perfect Weekend Adventure page: {e}", file=sys.stderr)
return False
if not adventure_page_id:
print("Error: 'Perfect Weekend Adventure' page not found as child of main page.", file=sys.stderr)
return False
# Get all blocks from the adventure page
all_blocks = notion_utils.get_all_blocks_recursively(notion, adventure_page_id)
# Get databases from the main Toronto Guide page
activities_db_id = None
food_db_id = None
cafes_db_id = None
main_blocks = notion_utils.get_all_blocks_recursively(notion, page_id)
for block in main_blocks:
if block.get("type") == "child_database":
title = block.get("child_database", {}).get("title", "")
if "Activities" in title:
activities_db_id = block.get("id")
elif "Food" in title:
food_db_id = block.get("id")
elif "Cafes" in title or "Caf�" in title:
cafes_db_id = block.get("id")
# Query databases to get expected data
beach_activities = []
cultural_restaurants = []
cafes_list = []
if activities_db_id:
try:
db_response = notion.databases.query(database_id=activities_db_id)
for page in db_response.get("results", []):
properties = page.get("properties", {})
tags_prop = properties.get("Tags", {})
if tags_prop.get("type") == "multi_select":
tags = [tag.get("name") for tag in tags_prop.get("multi_select", [])]
if "Beaches" in tags:
name_prop = properties.get("Name", {})
if name_prop.get("type") == "title" and name_prop.get("title"):
name = name_prop["title"][0]["plain_text"]
url_prop = properties.get("Google Maps Link", {})
url = url_prop.get("url", "") if url_prop.get("type") == "url" else ""
beach_activities.append({"name": name, "url": url})
except Exception as e:
print(f"Error querying Activities database: {e}", file=sys.stderr)
return False
if food_db_id:
try:
db_response = notion.databases.query(database_id=food_db_id)
for page in db_response.get("results", []):
properties = page.get("properties", {})
tags_prop = properties.get("Tags", {})
if tags_prop.get("type") == "multi_select":
tags = [tag.get("name") for tag in tags_prop.get("multi_select", [])]
for tag in tags:
if tag in ["Turkish", "Hakka"]:
name_prop = properties.get("Name", {})
if name_prop.get("type") == "title" and name_prop.get("title"):
name = name_prop["title"][0]["plain_text"]
cultural_restaurants.append({"name": name, "tag": tag})
break
except Exception as e:
print(f"Error querying Food database: {e}", file=sys.stderr)
return False
if cafes_db_id:
try:
db_response = notion.databases.query(database_id=cafes_db_id)
for page in db_response.get("results", []):
properties = page.get("properties", {})
name_prop = properties.get("Name", {})
if name_prop.get("type") == "title" and name_prop.get("title"):
name = name_prop["title"][0]["plain_text"]
cafes_list.append(name)
except Exception as e:
print(f"Error querying Cafes database: {e}", file=sys.stderr)
return False
# Required headings and their types
required_headings = [
("🎒 Perfect Weekend Adventure", "heading_1"),
("🏖️ Beach Activities", "heading_2"),
("🍽️ Cultural Dining Experience", "heading_2"),
("☕ Coffee Break Spots", "heading_2"),
("📊 Weekend Summary", "heading_2")
]
# Track verification results
found_headings = set()
found_beach_list = False
found_restaurant_list = False
found_toggle_with_cafes = False
found_summary = False
found_divider = False
found_callout = False
# Variables to track counts
beach_count = 0
restaurant_count = 0
cafe_count = 0
current_section = None
is_in_toggle = False
for block in all_blocks:
block_type = block.get("type")
block_text = notion_utils.get_block_plain_text(block)
# Check headings
for heading_text, expected_type in required_headings:
if heading_text in block_text and block_type == expected_type:
found_headings.add(heading_text)
current_section = heading_text
# Check Beach Activities section
if current_section == "🏖️ Beach Activities" and block_type == "bulleted_list_item":
found_beach_list = True
beach_count += 1
# Verify format includes name and potentially URL
for activity in beach_activities:
if activity["name"] in block_text:
if activity["url"] and activity["url"] not in block_text:
print(f"Warning: Beach activity '{activity['name']}' missing URL", file=sys.stderr)
# Check Cultural Dining section
elif current_section == "🍽️ Cultural Dining Experience" and block_type == "numbered_list_item":
found_restaurant_list = True
restaurant_count += 1
# Check format: Restaurant Name (Tag: [tag])
for restaurant in cultural_restaurants:
if restaurant["name"] in block_text and f"Tag: {restaurant['tag']}" in block_text:
pass # Format is correct
# Check Coffee Break Spots section
elif current_section == "☕ Coffee Break Spots":
if block_type == "toggle" and "Top Cafes to Visit" in block_text:
is_in_toggle = True
found_toggle_with_cafes = True
elif is_in_toggle and block_type == "to_do":
cafe_count += 1
# Verify unchecked status
to_do_data = block.get("to_do", {})
if to_do_data.get("checked", False):
print(f"Error: Cafe to-do item should be unchecked: {block_text}", file=sys.stderr)
return False
elif block_type in ["heading_1", "heading_2", "heading_3"]:
is_in_toggle = False
# Check Weekend Summary section
elif current_section == "📊 Weekend Summary" and block_type == "paragraph":
expected_text = f"This weekend includes {len(beach_activities)} beach activities, {len(cultural_restaurants)} cultural dining options, and {len(cafes_list)} coffee spots to explore!"
if expected_text in block_text:
found_summary = True
# Check for divider after summary
if block_type == "divider":
found_divider = True
# Check for callout with pro tip
if block_type == "callout":
callout_data = block.get("callout", {})
icon = callout_data.get("icon", {})
if icon.get("type") == "emoji" and icon.get("emoji") == "💡":
if "Pro tip: Check the Seasons database for the best time to enjoy outdoor activities!" in block_text:
found_callout = True
# Verify all required elements
all_passed = True
# Check all headings are present
for heading_text, _ in required_headings:
if heading_text not in found_headings:
print(f"Error: Missing required heading: {heading_text}", file=sys.stderr)
all_passed = False
# Check beach activities list
if not found_beach_list:
print("Error: Beach activities bulleted list not found", file=sys.stderr)
all_passed = False
elif beach_count != len(beach_activities):
print(f"Error: Expected {len(beach_activities)} beach activities, found {beach_count}", file=sys.stderr)
all_passed = False
# Check restaurant list
if not found_restaurant_list:
print("Error: Cultural dining numbered list not found", file=sys.stderr)
all_passed = False
elif restaurant_count != len(cultural_restaurants):
print(f"Error: Expected {len(cultural_restaurants)} cultural restaurants, found {restaurant_count}", file=sys.stderr)
all_passed = False
# Check cafes toggle
if not found_toggle_with_cafes:
print("Error: Toggle block 'Top Cafes to Visit' not found", file=sys.stderr)
all_passed = False
elif cafe_count != len(cafes_list):
print(f"Error: Expected {len(cafes_list)} cafes, found {cafe_count}", file=sys.stderr)
all_passed = False
# Check summary
if not found_summary:
print("Error: Weekend summary with correct counts not found", file=sys.stderr)
all_passed = False
# Check divider
if not found_divider:
print("Error: Divider block not found after summary", file=sys.stderr)
all_passed = False
# Check callout
if not found_callout:
print("Error: Callout with pro tip not found", file=sys.stderr)
all_passed = False
if all_passed:
print(f"Success: Perfect Weekend Adventure page created with all required elements.")
print(f"- {len(beach_activities)} beach activities")
print(f"- {len(cultural_restaurants)} cultural dining options")
print(f"- {len(cafes_list)} coffee spots")
return True
else:
return False
def main():
"""
Executes the verification process and exits with a status code.
"""
notion = notion_utils.get_notion_client()
main_id = sys.argv[1] if len(sys.argv) > 1 else None
if verify(notion, main_id):
sys.exit(0)
else:
sys.exit(1)
if __name__ == "__main__":
main()
================================================
FILE: tasks/playwright/easy/.gitkeep
================================================
================================================
FILE: tasks/playwright/standard/eval_web/cloudflare_turnstile_challenge/description.md
================================================
# Cloudflare Turnstile Authentication Challenge Task
Use Playwright MCP tools to complete the Cloudflare Turnstile authentication challenge.
## Requirements:
1. Navigate to https://eval-web.mcpmark.ai/auth/turnstile
2. Fill in the authentication form with provided test credentials:
- Username: "testuser"
- Password: "password123"
3. Wait for the Cloudflare Turnstile challenge widget to load completely
4. Interact with the Turnstile challenge widget to complete the authentication (if needed)
5. Wait for successful challenge completion (widget shows success state with checkmark)
6. Submit the form by clicking the "Sign In" button
7. Wait for and capture any success message or confirmation that appears
8. Output the success message captured in step 7
## Notes:
- Use the provided test credentials: testuser / password123
- The page shows the success message inline; it does not redirect to a separate success page
- Wait for all UI state changes before proceeding to the next step
- Verify both Turnstile completion and form submission success
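For reference only, the same flow sketched with the plain Playwright Python API rather than MCP tools (selectors and waits are assumptions about the page, not verified against it):
```python
# Illustrative sketch only: the Turnstile flow in plain Playwright (not MCP).
from playwright.sync_api import sync_playwright

with sync_playwright() as p:
    browser = p.chromium.launch(headless=False)
    page = browser.new_page()
    page.goto("https://eval-web.mcpmark.ai/auth/turnstile")

    page.get_by_label("Username").fill("testuser")     # assumed field label
    page.get_by_label("Password").fill("password123")  # assumed field label

    # Wait for the Turnstile widget iframe to load before submitting.
    page.wait_for_selector("iframe[src*='challenges.cloudflare.com']", timeout=30_000)

    page.get_by_role("button", name="Sign In").click()

    # The success message is rendered inline on the same page.
    print(page.get_by_text("Authentication successful").text_content())
    browser.close()
```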
================================================
FILE: tasks/playwright/standard/eval_web/cloudflare_turnstile_challenge/meta.json
================================================
{
"task_id": "cloudflare_turnstile_challenge",
"task_name": "Cloudflare Turnstile Challenge",
"category_id": "eval_web",
"category_name": "Eval Web",
"description": "Navigate websites with Cloudflare Turnstile protection, handle security challenges, bypass bot detection mechanisms, and successfully access protected content using automated browser interactions.",
"author": "Allison Zhan",
"created_at": "2025-07-27",
"difficulty": "L3",
"tags": [
"user interaction"
],
"mcp": [
"playwright"
],
"meta_data": {
"stateType": "video",
"stateContent": null,
"stateUrl": "https://storage.mcpmark.ai/tasks_state/playwright_video/mcpmark-eval-website.mp4",
"stateOriginalUrl": "https://mcp-eval-website.vercel.app/auth/turnstile"
}
}
================================================
FILE: tasks/playwright/standard/eval_web/cloudflare_turnstile_challenge/verify.py
================================================
#!/usr/bin/env python3
"""
Simplified verification script for Playwright Cloudflare Turnstile authentication task.
This script only verifies that the model successfully reported capturing the expected
success message by checking the last assistant message in messages.json.
"""
import sys
import json
import os
# Expected success message that agent should capture
EXPECTED_SUCCESS_MESSAGE = "Authentication successful! Security challenge verified."
def get_model_response():
"""
Get the model's response from the MCP_MESSAGES environment variable.
Returns the last assistant message text.
"""
messages_path = os.getenv("MCP_MESSAGES")
print(f"MCP_MESSAGES: {messages_path}")
if not messages_path:
print("Warning: MCP_MESSAGES environment variable not set", file=sys.stderr)
return None
try:
with open(messages_path, 'r') as f:
messages = json.load(f)
# Find the last assistant message with status completed
for message in reversed(messages):
if (message.get('role') == 'assistant' and
message.get('status') == 'completed' and
message.get('type') == 'message'):
content = message.get('content', [])
# Extract text from content
if isinstance(content, list):
for item in content:
if isinstance(item, dict) and item.get('type') in ['text', 'output_text']:
return item.get('text', '')
elif isinstance(content, str):
return content
print("Warning: No completed assistant message found", file=sys.stderr)
return None
except Exception as e:
print(f"Error reading messages file: {str(e)}", file=sys.stderr)
return None
def verify():
"""
Verifies that the model's last response contains the expected success message.
"""
# Get model's response from MCP_MESSAGES
model_response = get_model_response()
if not model_response:
print("No model response found", file=sys.stderr)
return False
print(f"\nModel response (first 500 chars): {model_response[:500]}...", file=sys.stderr)
# Check if the expected success message is in the model's response
if EXPECTED_SUCCESS_MESSAGE in model_response:
print(f"\n✓ Success message found: '{EXPECTED_SUCCESS_MESSAGE}'", file=sys.stderr)
return True
else:
print(f"\n✗ Success message NOT found: '{EXPECTED_SUCCESS_MESSAGE}'", file=sys.stderr)
return False
def main():
"""
Executes the verification process and exits with a status code.
"""
result = verify()
sys.exit(0 if result else 1)
if __name__ == "__main__":
main()
================================================
FILE: tasks/playwright/standard/eval_web/extraction_table/data.csv
================================================
Title, Rating, Likes, Views, Replies
React 18 New Features Deep Dive, "4.8", 856, 12543, 89
Vue 3 Composition API in Practice, "4.5", 743, 9876, 67
Advanced TypeScript Types Guide, "4.9", 924, 15432, 102
Node.js Performance Optimization, "4.2", 567, 8765, 45
Frontend Engineering Best Practices, "4.7", 812, 11234, 78
Microservices Architecture Patterns, "4.3", 634, 9543, 56
Docker Containerization Deployment, "4.6", 789, 10876, 71
Kubernetes Cluster Management, "4.4", 698, 9234, 63
GraphQL API Design Principles, "4.8", 876, 13456, 94
Webpack 5 Configuration Guide, "4.1", 523, 7654, 38
Vite Build Tool Usage, "4.5", 745, 10123, 69
ESLint Code Standards, "4.7", 823, 11567, 82
Unit Testing Best Practices, "4.3", 612, 8934, 51
Performance Monitoring & Optimization, "4.9", 945, 16234, 108
Security Protection Strategies, "4.2", 578, 8456, 47
Database Design Principles, "4.6", 767, 10567, 73
Caching Strategies Implementation, "4.4", 689, 9123, 61
Message Queue Applications, "4.8", 834, 12876, 87
Distributed Systems Design, "4.0", 456, 6789, 34
Cloud Native Development, "4.5", 723, 9876, 65
DevOps Process Optimization, "4.7", 801, 11234, 79
Machine Learning Introduction, "4.1", 534, 7543, 41
Artificial Intelligence Applications, "4.6", 778, 10456, 74
Blockchain Technology Fundamentals, "4.3", 645, 8765, 53
Mobile Development Techniques, "4.9", 912, 14567, 97
Cross-Platform Solutions, "4.2", 589, 8234, 48
Progressive Web App Development, "4.8", 867, 12345, 91
Web3 Development Guide, "4.4", 712, 9567, 64
NFT Smart Contracts, "4.5", 756, 10234, 70
DeFi Protocol Design, "4.7", 834, 11876, 83
Game Engine Development, "4.3", 623, 8567, 52
3D Graphics Rendering, "4.6", 789, 10678, 75
Audio Video Processing, "4.1", 545, 7234, 42
IoT Applications, "4.8", 856, 12567, 88
Edge Computing Practices, "4.2", 567, 8345, 46
5G Network Technology, "4.9", 923, 15123, 103
Quantum Computing Principles, "4.4", 678, 9345, 62
Bioinformatics Analysis, "4.5", 734, 9876, 68
Data Science Methods, "4.7", 812, 11456, 80
Algorithms and Data Structures, "4.3", 634, 8678, 54
System Design Interview, "4.6", 778, 10345, 76
Code Refactoring Techniques, "4.8", 845, 12234, 89
Open Source Contributions, "4.2", 556, 7890, 43
Technical Team Management, "4.5", 723, 9567, 66
Product Thinking Development, "4.9", 901, 14234, 95
User Experience Design, "4.1", 512, 7123, 39
Interface Interaction Optimization, "4.7", 789, 10890, 77
Accessibility Design, "4.4", 667, 8901, 58
SEO Optimization Strategies, "4.6", 756, 10123, 72
Social Media Operations, "4.3", 623, 8456, 55
Serverless Architecture, "4.7", 834, 11234, 81
API Gateway Design, "4.2", 567, 8765, 49
Microservice Communication, "4.8", 892, 13567, 95
Event-Driven Architecture, "4.5", 723, 9876, 67
CQRS Pattern Implementation, "4.3", 645, 8234, 54
Domain-Driven Design, "4.6", 778, 10456, 73
Clean Architecture Principles, "4.4", 689, 9123, 62
Hexagonal Architecture, "4.1", 534, 7543, 42
Onion Architecture, "4.5", 712, 9567, 65
Event Sourcing Patterns, "4.7", 823, 11876, 79
Saga Pattern for Distributed Systems, "4.3", 612, 8934, 53
Circuit Breaker Pattern, "4.8", 856, 12543, 87
Bulkhead Pattern, "4.2", 578, 8456, 47
Retry Pattern Implementation, "4.6", 767, 10567, 74
Timeout Pattern, "4.4", 698, 9234, 63
Rate Limiting Strategies, "4.9", 934, 15432, 103
Load Balancing Techniques, "4.1", 523, 7654, 39
Service Mesh Architecture, "4.5", 745, 10123, 69
Istio Service Mesh, "4.7", 812, 11567, 82
Envoy Proxy Configuration, "4.3", 634, 9543, 56
Consul Service Discovery, "4.6", 789, 10876, 71
Kubernetes Ingress, "4.4", 676, 9345, 58
Helm Chart Development, "4.8", 845, 12234, 89
Terraform Infrastructure, "4.2", 556, 7890, 44
Ansible Automation, "4.5", 723, 9567, 66
Jenkins Pipeline, "4.7", 801, 11234, 78
GitLab CI/CD, "4.3", 623, 8567, 52
GitHub Actions, "4.6", 789, 10678, 75
Azure DevOps, "4.1", 512, 7123, 41
AWS CodePipeline, "4.8", 867, 12345, 91
Docker Compose, "4.4", 712, 9567, 64
Kubernetes Operators, "4.5", 756, 10234, 70
Custom Resource Definitions, "4.7", 834, 11876, 83
Pod Security Policies, "4.3", 623, 8567, 52
Network Policies, "4.6", 789, 10678, 75
RBAC Configuration, "4.1", 545, 7234, 42
Secret Management, "4.8", 856, 12567, 88
ConfigMap Usage, "4.2", 567, 8345, 46
Persistent Volumes, "4.9", 923, 15123, 103
StatefulSets, "4.4", 678, 9345, 62
DaemonSets, "4.5", 734, 9876, 68
Jobs and CronJobs, "4.7", 812, 11456, 80
Horizontal Pod Autoscaler, "4.3", 634, 8678, 54
Vertical Pod Autoscaler, "4.6", 778, 10345, 76
Cluster Autoscaler, "4.8", 845, 12234, 89
Resource Quotas, "4.2", 556, 7890, 43
Limit Ranges, "4.5", 723, 9567, 66
================================================
FILE: tasks/playwright/standard/eval_web/extraction_table/description.md
================================================
# Web Data Extraction Task
Use Playwright MCP tools to extract all data from the specified website and present it in CSV format.
## Requirements:
1. Navigate to https://eval-web.mcpmark.ai/extraction
2. Wait for the page to fully load
3. Extract all data content from the page, including:
- Title
- Rating
- Likes
- Views
- Replies
4. Organize the extracted data into CSV format
5. Ensure data completeness and accuracy
6. Output ONLY the complete CSV formatted data (no additional text or explanations)
## CSV Data Example:
```csv
Title, Rating, Likes, Views, Replies
SEO Optimization, "4.6", 756, 10123, 72
Vue 3 Composition API, "4.5", 743, 9876, 67
Advanced TypeScript Types Guide, "4.9", 924, 15432, 102
Node.js Performance Optimization, "4.2", 567, 8765, 45
Frontend Engineering Best Practices, "4.7", 812, 11234, 78
```
## Notes:
- Ensure extraction of all visible data rows
- Maintain data format consistency
- All numeric data (Rating, Likes, Views, Replies) should NOT have quotes, only text data containing commas should be wrapped in quotes
- Wait for the page to fully load before starting data extraction
- Verify the quantity and format of extracted data are correct
- **IMPORTANT: Final output must contain ONLY CSV data - no explanatory text, descriptions, or other content**
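A tiny sketch of the quoting rule from the notes (illustrative only; the row values are taken from the example above): numeric columns stay bare, and a title is quoted only if it itself contains a comma.
```python
# Illustrative sketch of the quoting rule described in the notes above.
def format_row(title, rating, likes, views, replies):
    # Quote the title only when it contains a comma; numeric columns stay bare.
    title_out = f'"{title}"' if "," in title else title
    return f"{title_out}, {rating}, {likes}, {views}, {replies}"

print("Title, Rating, Likes, Views, Replies")
print(format_row("Advanced TypeScript Types Guide", 4.9, 924, 15432, 102))
```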
================================================
FILE: tasks/playwright/standard/eval_web/extraction_table/meta.json
================================================
{
"task_id": "extraction_table",
"task_name": "Extraction Table",
"category_id": "eval_web",
"category_name": "Eval Web",
"description": "Extract structured data from complex web tables, parse multi-level headers, handle dynamic content loading, transform data formats, and export comprehensive datasets.",
"author": "Arvin Xu",
"created_at": "2025-08-18",
"difficulty": "L3",
"tags": [
"data extraction"
],
"mcp": [
"playwright"
],
"meta_data": {
"stateType": "video",
"stateContent": null,
"stateUrl": "https://storage.mcpmark.ai/tasks_state/playwright_video/mcpmark-eval-website.mp4",
"stateOriginalUrl": "https://eval-web.mcpmark.ai/extraction"
}
}
================================================
FILE: tasks/playwright/standard/eval_web/extraction_table/verify.py
================================================
#!/usr/bin/env python3
"""
Verification script for checking Playwright web data extraction tasks.
This script verifies whether the model successfully extracted CSV format data from web pages
by checking the last assistant message in messages.json.
"""
import sys
import json
import os
import re
import csv
from io import StringIO
# Expected CSV header (must match exactly, including spaces)
EXPECTED_HEADER_LINE = "Title, Rating, Likes, Views, Replies"
EXPECTED_HEADERS = ["Title", "Rating", "Likes", "Views", "Replies"]
# Exact number of data rows (must match data.csv exactly)
EXPECTED_DATA_ROWS = 97
def get_model_response():
"""
Get the model's response from the MCP_MESSAGES environment variable.
Returns the last assistant message text.
"""
messages_path = os.getenv("MCP_MESSAGES")
print(f"| MCP_MESSAGES: {messages_path}")
if not messages_path:
print("| Warning: MCP_MESSAGES environment variable not set", file=sys.stderr)
return None
try:
with open(messages_path, 'r') as f:
messages = json.load(f)
# Find the last assistant message with status completed
for message in reversed(messages):
if (message.get('role') == 'assistant' and
message.get('status') == 'completed' and
message.get('type') == 'message'):
content = message.get('content', [])
# Extract text from content
if isinstance(content, list):
for item in content:
if isinstance(item, dict) and item.get('type') in ['text', 'output_text']:
return item.get('text', '')
elif isinstance(content, str):
return content
print("| Warning: No completed assistant message found", file=sys.stderr)
return None
except Exception as e:
print(f"| Error reading messages file: {str(e)}", file=sys.stderr)
return None
def extract_csv_from_response(response):
"""
Extract CSV data from model response.
"""
# Look for CSV code blocks
csv_pattern = r'```(?:csv)?\s*\n(.*?)\n```'
matches = re.findall(csv_pattern, response, re.DOTALL | re.IGNORECASE)
if matches:
return matches[-1].strip() # Return the last CSV block
# If no code block found, try to find CSV data starting with header
lines = response.split('\n')
csv_start = -1
# Stricter header matching: look for a line containing "Title", "Rating", and "Likes"
for i, line in enumerate(lines):
if "Title" in line and "Rating" in line and "Likes" in line:
csv_start = i
break
if csv_start >= 0:
# Extract from header until empty line or non-CSV format line
csv_lines = []
for line in lines[csv_start:]:
line = line.strip()
if not line or not (',' in line):
if csv_lines: # If we already have data, stop at empty line
break
continue
csv_lines.append(line)
if len(csv_lines) > 100: # Prevent extracting too many rows
break
return '\n'.join(csv_lines)
return None
def validate_csv_data(csv_text):
"""
Validate CSV data format and content, must match data.csv exactly.
"""
if not csv_text:
return False, "CSV data not found"
try:
lines = csv_text.strip().split('\n')
# Check total number of rows (1 header row + data rows)
expected_total_rows = EXPECTED_DATA_ROWS + 1
if len(lines) != expected_total_rows:
return False, f"| CSV total row count mismatch, expected: {expected_total_rows} rows, actual: {len(lines)} rows"
# Check header row format (must match exactly)
header_line = lines[0].strip()
if header_line != EXPECTED_HEADER_LINE:
return False, f"| Header format mismatch, expected: '{EXPECTED_HEADER_LINE}', actual: '{header_line}'"
# Parse CSV to validate structure
csv_reader = csv.reader(StringIO(csv_text))
rows = list(csv_reader)
# Check column count for each row
expected_columns = len(EXPECTED_HEADERS)
for i, row in enumerate(rows):
if len(row) != expected_columns:
return False, f"| Row {i+1} column count incorrect, expected: {expected_columns} columns, actual: {len(row)} columns"
# Validate data row format
valid_rows = 0
for i, row in enumerate(rows[1:], 2): # Skip header, start from row 2
# Check if each column has data
if not all(cell.strip() for cell in row):
return False, f"| Row {i} contains empty data"
# Check numeric column format (Rating, Likes, Views, Replies should not have quotes)
for col_idx, col_name in [(1, "Rating"), (2, "Likes"), (3, "Views"), (4, "Replies")]:
value = row[col_idx].strip()
# Check for quotes (should not have any)
if value.startswith('"') and value.endswith('"'):
return False, f"| Row {i} {col_name} should not have quotes, actual: {value}"
# Check numeric format
if col_name == "Rating":
try:
float(value)
except ValueError:
return False, f"| Row {i} {col_name} should be a number, actual: {value}"
else:
if not value.isdigit():
return False, f"| Row {i} {col_name} should be pure digits, actual: {value}"
valid_rows += 1
# Validate number of data rows
if valid_rows != EXPECTED_DATA_ROWS:
return False, f"| Valid data row count mismatch, expected: {EXPECTED_DATA_ROWS} rows, actual: {valid_rows} rows"
return True, f"| CSV validation successful: format matches data.csv exactly, {valid_rows} valid data rows"
except Exception as e:
return False, f"| CSV format parsing error: {str(e)}"
def verify():
"""
Verify if the model's response contains correct CSV data extraction results.
"""
# Get model response
model_response = get_model_response()
if not model_response:
print("| Model response not found", file=sys.stderr)
return False
print(f"|\n| Model response (first 500 characters): {model_response[:500]}...", file=sys.stderr)
# Extract CSV data from response
csv_data = extract_csv_from_response(model_response)
if not csv_data:
print("|\n| ✗ CSV data not found in response", file=sys.stderr)
return False
print(f"|\n| Found CSV data (first 300 characters):\n| {csv_data[:300]}...", file=sys.stderr)
# Validate CSV data
is_valid, message = validate_csv_data(csv_data)
if is_valid:
print(f"|\n| ✓ {message}", file=sys.stderr)
return True
else:
print(f"|\n| ✗ CSV validation failed: {message}", file=sys.stderr)
return False
def main():
"""
Executes the verification process and exits with a status code.
"""
result = verify()
sys.exit(0 if result else 1)
if __name__ == "__main__":
main()
================================================
FILE: tasks/playwright/standard/web_search/birth_of_arvinxu/description.md
================================================
# Web Search Task
Use Playwright MCP tools to search for information about the X profile https://x.com/arvin17x and find out when this person was born.
## Requirements:
Extract the answer in a specific format:
- just the year, like 1990 or 2001
================================================
FILE: tasks/playwright/standard/web_search/birth_of_arvinxu/meta.json
================================================
{
"task_id": "birth_of_arvinxu",
"task_name": "Birth Of Arvinxu",
"category_id": "web_search",
"category_name": "Web Search",
"description": "Search for biographical information about X profile arvin17x across multiple web sources, extract birth year data, verify information accuracy, and compile findings.",
"author": "Arvin Xu",
"created_at": "2025-08-18",
"difficulty": "L3",
"tags": [
"search aggregation",
"data extraction"
],
"mcp": [
"playwright"
],
"meta_data": {
"stateType": null,
"stateContent": null,
"stateUrl": null,
"stateOriginalUrl": null
}
}
================================================
FILE: tasks/playwright/standard/web_search/birth_of_arvinxu/verify.py
================================================
#!/usr/bin/env python3
"""
Verification script for Playwright web search task.
Simple verification that checks if the AI agent found the correct answer.
The expected ground truth answer is configured at the top of the file.
"""
import sys
import json
import os
from pathlib import Path
from typing import Dict, Any
# =============================================================================
# CONFIGURATION
# =============================================================================
# Expected ground truth answer (exact match)
EXPECTED_GROUND_TRUTH = "1995"
# =============================================================================
# MCP RESULT PARSING
# =============================================================================
def get_working_directory() -> Path:
"""Get the working directory where messages.json should be."""
# Priority 1: Use MCP_MESSAGES path if available (most reliable)
messages_path = os.getenv("MCP_MESSAGES")
if messages_path and Path(messages_path).exists():
return Path(messages_path).parent.resolve()
# Priority 2: Use PLAYWRIGHT_WORK_DIR environment variable
work_dir = os.getenv("PLAYWRIGHT_WORK_DIR")
if work_dir:
work_path = Path(work_dir).resolve()
if (work_path / "messages.json").exists():
return work_path
# Priority 3: Check current directory (fallback)
current_dir = Path.cwd()
if (current_dir / "messages.json").exists():
return current_dir
# Priority 4: Default fallback
return Path(".").resolve()
def parse_ai_results(work_dir: Path) -> Dict[str, Any]:
"""Parse the AI agent's results from messages.json"""
messages_file = work_dir / "messages.json"
if not messages_file.exists():
return {"success": False, "error": "No messages.json found"}
try:
with open(messages_file, "r", encoding="utf-8") as f:
messages = json.load(f)
except (json.JSONDecodeError, IOError) as e:
return {"success": False, "error": f"Failed to read messages.json: {e}"}
# Look for expected answer in the AI's responses
found_answer = False
ai_responses = []
for message in messages:
if message.get("role") == "assistant":
content = str(message.get("content", ""))
# Handle both string and list content formats
if isinstance(message.get("content"), list):
content = " ".join(
item.get("text", "") if isinstance(item, dict) else str(item)
for item in message.get("content", [])
)
ai_responses.append(content)
# Exact match (character-for-character, case-sensitive, no trimming)
if content == EXPECTED_GROUND_TRUTH:
found_answer = True
return {
"success": True,
"found_answer": found_answer,
"ai_responses": ai_responses,
"total_responses": len(ai_responses),
}
# =============================================================================
# MAIN VERIFICATION
# =============================================================================
def verify_task() -> bool:
"""Verify the AI agent found the correct answer"""
# Parse AI agent results
work_dir = get_working_directory()
print(f"| Working directory: {work_dir}")
ai_results = parse_ai_results(work_dir)
if not ai_results["success"]:
print(f"| ❌ Could not parse AI results: {ai_results.get('error')}")
return False
if ai_results["found_answer"]:
print(f"| AI agent correctly identified: {EXPECTED_GROUND_TRUTH}")
return True
else:
print(f"| AI agent did not find the correct answer: {EXPECTED_GROUND_TRUTH}")
return False
def main():
"""Main verification function."""
try:
success = verify_task()
sys.exit(0 if success else 1)
except Exception as e:
print(f"\n💥 Verification error: {e}")
sys.exit(1)
if __name__ == "__main__":
main()
================================================
FILE: tasks/playwright/standard/web_search/r1_arxiv/content.txt
================================================
In this work, we share our journey in enhancing model reasoning abilities through reinforcement learning. DeepSeek-R1-Zero represents a pure RL approach without relying on cold-start data, achieving strong performance across various tasks. DeepSeek-R1 is more powerful, leveraging cold-start data alongside iterative RL fine-tuning. Ultimately, DeepSeek-R1 achieves performance comparable to OpenAI-o1-1217 on a range of tasks.
We further explore distillation the reasoning capability to small dense models. We use DeepSeek-R1 as the teacher model to generate 800K training samples, and fine-tune several small dense models. The results are promising: DeepSeek-R1-Distill-Qwen-1.5B outperforms GPT-4o and Claude-3.5-Sonnet on math benchmarks with 28.9% on AIME and 83.9% on MATH. Other dense models also achieve impressive results, significantly outperforming other instruction-tuned models based on the same underlying checkpoints.
In the future, we plan to invest in research across the following directions for DeepSeek-R1.
- **General Capability**: Currently, the capabilities of DeepSeek-R1 fall short of DeepSeek-V3 in tasks such as function calling, multi-turn, complex role-playing, and JSON output. Moving forward, we plan to explore how long CoT can be leveraged to enhance tasks in these fields.
- **Language Mixing**: DeepSeek-R1 is currently optimized for Chinese and English, which may result in language mixing issues when handling queries in other languages. For instance, DeepSeek-R1 might use English for reasoning and responses, even if the query is in a language other than English or Chinese. We aim to address this limitation in future updates.
- **Prompting Engineering**: When evaluating DeepSeek-R1, we observe that it is sensitive to prompts. Few-shot prompting consistently degrades its performance. Therefore, we recommend users directly describe the problem and specify the output format using a zero-shot setting for optimal results.
- **Software Engineering Tasks**: Due to the long evaluation times, which impact the efficiency of the RL process, large-scale RL has not been applied extensively in software engineering tasks. As a result, DeepSeek-R1 has not demonstrated a huge improvement over DeepSeek-V3 on software engineering benchmarks. Future versions will address this by implementing rejection sampling on software engineering data or incorporating asynchronous evaluations during the RL process to improve efficiency.
================================================
FILE: tasks/playwright/standard/web_search/r1_arxiv/description.md
================================================
# Web Search Task
Use Playwright MCP tools to search for the DeepSeek R1 research paper and extract all the paragraphs of the Conclusion section.
## Requirements:
1. Search for the DeepSeek R1 research paper
2. Navigate to the paper and find the Conclusion section
3. Extract **ALL the paragraphs** of the Conclusion section
4. **Provide the content in Markdown format - no explanations, no additional text**
## Important Notes:
- **Output ALL the paragraphs of text**
- **Do NOT include any explanations, summaries, or additional content**
- **The response should contain ONLY the Conclusion section content formatted in Markdown**
## Expected Output:
All the paragraphs of the Conclusion section from the DeepSeek R1 paper, formatted in Markdown with proper paragraph structure and formatting.
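For reference, the bundled `verify.py` compares the agent's final response against `content.txt` using a whitespace-normalized exact match, so line breaks and indentation are ignored but the wording must match exactly. A minimal sketch of that comparison (the function and sample strings here are illustrative):

```
def whitespace_normalized_match(extracted: str, expected: str) -> bool:
    # Collapse every run of whitespace (spaces, tabs, newlines) to a single
    # space before comparing, mirroring compare_content in verify.py.
    return " ".join(extracted.split()) == " ".join(expected.split())

# Line-wrapping differences are tolerated; any change in wording is not.
assert whitespace_normalized_match("In this work,\n  we share", "In this work, we share")
assert not whitespace_normalized_match("In this work we share", "In this work, we share")
```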
================================================
FILE: tasks/playwright/standard/web_search/r1_arxiv/meta.json
================================================
{
"task_id": "r1_arxiv",
"task_name": "R1 Arxiv",
"category_id": "web_search",
"category_name": "Web Search",
"description": "Search arXiv for R1 model research papers, extract technical specifications, analyze methodology sections, compile research findings, and generate comprehensive literature review.",
"author": "Arvin Xu",
"created_at": "2025-08-18",
"difficulty": "L3",
"tags": [
"search aggregation",
"data extraction",
"comparative analysis",
"content submission"
],
"mcp": [
"playwright"
],
"meta_data": {
"stateType": null,
"stateContent": null,
"stateUrl": null,
"stateOriginalUrl": null
}
}
================================================
FILE: tasks/playwright/standard/web_search/r1_arxiv/verify.py
================================================
#!/usr/bin/env python3
"""
Verification script for Playwright web search task.
Simple verification that checks if the AI agent found the correct Conclusion content.
The expected ground-truth content is loaded from content.txt; its filename is configured at the top of the file.
"""
import sys
import json
import os
from pathlib import Path
from typing import Dict, Any
# =============================================================================
# CONFIGURATION
# =============================================================================
# Expected ground truth content from content.txt
EXPECTED_CONTENT_FILE = "content.txt"
# =============================================================================
# MCP RESULT PARSING
# =============================================================================
def get_working_directory() -> Path:
"""Get the working directory where messages.json should be."""
# Priority 1: Use MCP_MESSAGES path if available (most reliable)
messages_path = os.getenv("MCP_MESSAGES")
if messages_path and Path(messages_path).exists():
return Path(messages_path).parent.resolve()
# Priority 2: Use PLAYWRIGHT_WORK_DIR environment variable
work_dir = os.getenv("PLAYWRIGHT_WORK_DIR")
if work_dir:
work_path = Path(work_dir).resolve()
if (work_path / "messages.json").exists():
return work_path
# Priority 3: Check current directory (fallback)
current_dir = Path.cwd()
if (current_dir / "messages.json").exists():
return current_dir
# Priority 4: Default fallback
return Path(".").resolve()
def load_expected_content() -> str:
"""Load the expected content from content.txt"""
# content.txt is in the same directory as verify.py
current_file = Path(__file__).resolve()
content_file = current_file.parent / EXPECTED_CONTENT_FILE
if not content_file.exists():
print(f"| {EXPECTED_CONTENT_FILE} not found at: {content_file}")
return ""
print(f"| Found {EXPECTED_CONTENT_FILE} at: {content_file}")
try:
with open(content_file, "r", encoding="utf-8") as f:
return f.read().strip()
except (IOError, UnicodeDecodeError) as e:
print(f"| Warning: Could not read {content_file}: {e}")
return ""
def parse_ai_results(work_dir: Path) -> Dict[str, Any]:
"""Parse the AI agent's results from messages.json"""
messages_file = work_dir / "messages.json"
if not messages_file.exists():
return {"success": False, "error": "No messages.json found"}
try:
with open(messages_file, "r", encoding="utf-8") as f:
messages = json.load(f)
except (json.JSONDecodeError, IOError) as e:
return {"success": False, "error": f"Failed to read messages.json: {e}"}
# Look for extracted content in the AI's responses
found_content = False
ai_responses = []
extracted_content = ""
for message in messages:
if message.get("role") == "assistant":
content = str(message.get("content", ""))
# Handle both string and list content formats
if isinstance(message.get("content"), list):
content = " ".join(
item.get("text", "") if isinstance(item, dict) else str(item)
for item in message.get("content", [])
)
ai_responses.append(content)
# Store the last response as extracted content
extracted_content = content
return {
"success": True,
"found_content": True, # Assuming content was found if we have responses
"ai_responses": ai_responses,
"extracted_content": extracted_content,
"total_responses": len(ai_responses),
}
def compare_content(extracted: str, expected: str) -> Dict[str, Any]:
"""Compare extracted content with expected content"""
if not expected:
return {"success": False, "error": "No expected content to compare against"}
if not extracted:
return {"success": False, "error": "No extracted content found"}
# Normalize content for comparison (remove extra whitespace, normalize line breaks)
extracted_normalized = " ".join(extracted.split())
expected_normalized = " ".join(expected.split())
# Direct text comparison - content must be exactly the same
is_exact_match = extracted_normalized == expected_normalized
return {
"success": True,
"is_exact_match": is_exact_match,
"extracted_length": len(extracted_normalized),
"expected_length": len(expected_normalized),
"extracted_preview": extracted_normalized[:100] + "..." if len(extracted_normalized) > 100 else extracted_normalized,
"expected_preview": expected_normalized[:100] + "..." if len(expected_normalized) > 100 else expected_normalized
}
# =============================================================================
# MAIN VERIFICATION
# =============================================================================
def verify_task(work_dir: Path) -> bool:
"""Verify the AI agent found the correct Introduction content"""
print("| Verifying Playwright Web Search Task - DeepSeek R1 Introduction")
print("| " + "=" * 70)
# Load expected content
print("| Loading expected content...")
expected_content = load_expected_content()
if not expected_content:
print("| Error: Could not load expected content")
return False
print(f"| Expected content loaded ({len(expected_content)} characters)")
# Parse MCP messages
messages = parse_ai_results(work_dir)
if not messages["success"]:
print(f"| Error: Could not parse AI results: {messages.get('error')}")
return False
# Extract AI agent response
extracted_content = messages.get("extracted_content", "")
if not extracted_content:
print("| Error: No AI agent response found")
return False
print(f"| Extracted content: {len(extracted_content)} characters")
# Compare content
print("| Comparing extracted content with expected content...")
comparison = compare_content(extracted_content, expected_content)
if not comparison["success"]:
print(f"| Comparison failed: {comparison.get('error')}")
return False
print(f"| Content comparison results:")
print(f"| - Extracted length: {comparison['extracted_length']} characters")
print(f"| - Expected length: {comparison['expected_length']} characters")
print(f"| - Extracted preview: {comparison['extracted_preview']}")
print(f"| - Expected preview: {comparison['expected_preview']}")
if comparison['is_exact_match']:
print("| Task completed successfully! Content matches exactly.")
return True
else:
print("| Task verification failed. Content does not match exactly.")
return False
def main():
"""Main verification function"""
print("| Starting verification...")
# Get working directory
work_dir = get_working_directory()
print(f"| Working directory: {work_dir}")
# Run verification
success = verify_task(work_dir)
if success:
sys.exit(0)
else:
sys.exit(1)
if __name__ == "__main__":
main()
================================================
FILE: tasks/playwright_webarena/easy/.gitkeep
================================================
================================================
FILE: tasks/playwright_webarena/easy/reddit/ai_data_analyst/description.md
================================================
Deliver a bite-sized AI community snapshot by capturing two essential metrics and posting them back to the forum.
**Task Requirements:**
1. Register and log in with username `AIDataAnalyst2025` and password `SecurePass123!` so every action is tied to that account.
2. Visit the `deeplearning` forum (page 1 only) and count how many posts are visible; note the exact total.
3. Go to the `MachineLearning` forum, find the post titled `[P] I made a command-line tool that explains your errors using ChatGPT (link in comments)`, and record the vote count shown in the listing.
4. Still in `MachineLearning`, create a post titled `MachineLearning_Extraction` whose body is exactly the two-line Markdown list below (keep the keys and pipe separators untouched, replacing each `FILL_VALUE` with your numbers):
```
- Deeplearning_Post_Count|FILL_VALUE
- ChatGPT_Tool_Vote_Count|FILL_VALUE
```
No searches or multi-page navigation are required—just capture these two data points and report them accurately.
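For reference, the verifier reads the post body back from the forum and parses each `key|value` line with a helper like `parse_key_value_format` in the accompanying `verify.py`. A minimal sketch of that parsing (the numbers below are placeholders, not the expected answers):

```
import re

def parse_key_value_format(text: str) -> dict:
    data = {}
    for line in text.splitlines():
        line = re.sub(r"^[-•*]\s*", "", line.strip())  # drop a leading list bullet
        if "|" in line:
            key, value = line.split("|", 1)
            data[key.strip()] = value.strip()
    return data

body = "- Deeplearning_Post_Count|12\n- ChatGPT_Tool_Vote_Count|345"  # placeholder numbers
print(parse_key_value_format(body))
# {'Deeplearning_Post_Count': '12', 'ChatGPT_Tool_Vote_Count': '345'}
```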
================================================
FILE: tasks/playwright_webarena/easy/reddit/ai_data_analyst/label.txt
================================================
- Deeplearning_Post_Count|25
- ChatGPT_Tool_Vote_Count|2655
================================================
FILE: tasks/playwright_webarena/easy/reddit/ai_data_analyst/meta.json
================================================
{
"task_id": "ai_data_analyst_easy",
"task_name": "AI Data Analyst (Easy)",
"category_id": "reddit",
"category_name": "Reddit",
"description": "Capture a lightweight AI/ML engagement snapshot by signing in, grabbing three key metrics, and posting them back to the MachineLearning forum.",
"author": "Fanqing Meng",
"created_at": "2025-11-15",
"difficulty": "L1",
"tags": [
"user interaction",
"data extraction",
"comparative analysis"
],
"mcp": [
"playwright"
],
"meta_data": {
"stateType": "video",
"stateContent": null,
"stateUrl": "https://storage.mcpmark.ai/tasks_state/playwright_video/postmill.mp4",
"stateOriginalUrl": "https://github.com/web-arena-x/webarena/tree/main/environment_docker"
}
}
================================================
FILE: tasks/playwright_webarena/easy/reddit/ai_data_analyst/verify.py
================================================
import asyncio
import os
import re
import sys
from pathlib import Path
from playwright.async_api import async_playwright, TimeoutError as PlaywrightTimeoutError
BASE_URL = os.getenv("WEBARENA_BASE_URL", "http://localhost:9999").rstrip("/")
USERNAME = "AIDataAnalyst2025"
PASSWORD = "SecurePass123!"
POST_TITLE = "MachineLearning_Extraction"
REQUIRED_FIELDS = [
"Deeplearning_Post_Count",
"ChatGPT_Tool_Vote_Count",
]
LABEL_PATH = Path(__file__).parent / "label.txt"
def parse_key_value_format(text: str) -> dict:
data = {}
if not text:
return data
for line in text.splitlines():
line = line.strip()
if not line or "|" not in line:
continue
line = re.sub(r"^[-•*]\s*", "", line)
key, value = line.split("|", 1)
data[key.strip()] = value.strip()
return data
def load_expected_values() -> dict:
if not LABEL_PATH.exists():
return {}
return parse_key_value_format(LABEL_PATH.read_text(encoding="utf-8"))
async def ensure_logged_in(page) -> bool:
print("Step 1: Ensuring we are logged in...", file=sys.stderr)
await page.goto(f"{BASE_URL}/", wait_until="networkidle")
user_button = page.locator(f'button:has-text("{USERNAME}")')
if await user_button.count():
print("✓ Already logged in", file=sys.stderr)
return True
login_link = page.locator('a:has-text("Log in")')
if not await login_link.count():
print("FAILED: Login link not found", file=sys.stderr)
return False
await login_link.click()
await page.wait_for_load_state("networkidle")
await page.fill('input[name="_username"]', USERNAME)
await page.fill('input[name="_password"]', PASSWORD)
await page.click('button:has-text("Log in")')
await page.wait_for_load_state("networkidle")
if await page.locator(f'button:has-text("{USERNAME}")').count():
print(f"✓ Logged in as {USERNAME}", file=sys.stderr)
return True
print("FAILED: Could not log in with provided credentials", file=sys.stderr)
return False
async def fetch_submission_content(page):
print("Step 2: Retrieving MachineLearning submission...", file=sys.stderr)
await page.goto(f"{BASE_URL}/f/MachineLearning", wait_until="networkidle")
post_link = page.locator(f'a:has-text("{POST_TITLE}")')
if not await post_link.count():
print(
f"FAILED: Submission '{POST_TITLE}' not found in MachineLearning forum",
file=sys.stderr,
)
return None
await post_link.first.click()
await page.wait_for_load_state("networkidle")
selectors = [
".submission__body",
"article",
".post-body",
".RichText",
'[class*="RichText"]',
]
for selector in selectors:
locator = page.locator(selector)
if await locator.count():
content = await locator.first.inner_text()
if content:
print(f"✓ Found submission body via selector {selector}", file=sys.stderr)
return content
print("FAILED: Unable to locate submission body content", file=sys.stderr)
return None
def validate_submission(extracted: dict, expected: dict) -> bool:
missing = [key for key in REQUIRED_FIELDS if key not in extracted]
if missing:
print(
f"FAILED: Submission body missing required keys: {', '.join(missing)}",
file=sys.stderr,
)
return False
errors = []
for key in REQUIRED_FIELDS:
actual = extracted.get(key, "")
expect = expected.get(key, "")
try:
actual_val = int(actual)
expect_val = int(expect)
if actual_val != expect_val:
errors.append(f"{key}: expected {expect_val}, found {actual_val}")
except ValueError:
errors.append(f"{key}: value '{actual}' is not numeric")
if errors:
print("FAILED: Submission values do not match expected data:", file=sys.stderr)
for err in errors:
print(f" - {err}", file=sys.stderr)
return False
print("✓ Submission content matches expected metrics", file=sys.stderr)
return True
async def verify() -> bool:
expected = load_expected_values()
if not expected:
print("FAILED: label.txt with expected values is missing", file=sys.stderr)
return False
async with async_playwright() as p:
browser = await p.chromium.launch(headless=True)
context = await browser.new_context()
page = await context.new_page()
try:
if not await ensure_logged_in(page):
return False
content = await fetch_submission_content(page)
if not content:
return False
extracted = parse_key_value_format(content)
if not validate_submission(extracted, expected):
return False
print("SUCCESS: Easy AI data analyst task verified", file=sys.stderr)
return True
except PlaywrightTimeoutError as exc:
print(f"FAILED: Timeout occurred - {exc}", file=sys.stderr)
return False
except Exception as exc:
print(f"FAILED: Unexpected error - {exc}", file=sys.stderr)
return False
finally:
await browser.close()
def main():
result = asyncio.run(verify())
sys.exit(0 if result else 1)
if __name__ == "__main__":
main()
================================================
FILE: tasks/playwright_webarena/easy/reddit/llm_research_summary/description.md
================================================
Produce a three-line GPT activity bulletin so readers can quickly see how active the MachineLearning forum is.
**Task Requirements:**
1. Register and log in with username `llm_analyst_2024` and password `LLMExpert2024!`.
2. On the first page of the `MachineLearning` forum, count how many post titles contain `GPT`, `ChatGPT`, or `LLM`; record the exact total.
3. From that subset, identify the single post with the highest upvote count and note its full title and vote total.
4. Create a new post titled `LLM Research Summary: GPT Discussions Analysis [2024]` in the same forum. The body must be exactly the Markdown list below with your values in place of each `FILL_VALUE` (keep the keys, order, and pipe separator intact):
```
- Total_LLM_Posts|FILL_VALUE
- Top1_Title|FILL_VALUE
- Top1_Upvotes|FILL_VALUE
```
That’s the entire easy version—just report the headline LLM count and the hottest related thread.
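For reference, the verifier in this folder compares `Top1_Title` as text after normalizing typographic quotes to their ASCII equivalents and collapsing whitespace, so either straight or curly apostrophes in the copied title are accepted. A minimal sketch of that normalization (mirroring `normalize_text` in `verify.py`):

```
def normalize_text(value: str) -> str:
    # Map typographic quotes to ASCII and collapse runs of whitespace.
    for src, dst in {"\u2019": "'", "\u2018": "'", "\u201c": '"', "\u201d": '"'}.items():
        value = value.replace(src, dst)
    return " ".join(value.split()).strip()

# A curly apostrophe and a straight one compare equal after normalization.
assert normalize_text("It\u2019s  a  test") == normalize_text("It's a test")
```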
================================================
FILE: tasks/playwright_webarena/easy/reddit/llm_research_summary/label.txt
================================================
- Total_LLM_Posts|9
- Top1_Title|[P] I made a command-line tool that explains your errors using ChatGPT (link in comments)
- Top1_Upvotes|2655
================================================
FILE: tasks/playwright_webarena/easy/reddit/llm_research_summary/meta.json
================================================
{
"task_id": "llm_research_summary_easy",
"task_name": "LLM Research Summary (Easy)",
"category_id": "reddit",
"category_name": "Reddit",
"description": "Collect the headline GPT metrics from MachineLearning and publish a short five-line recap.",
"author": "Fanqing Meng",
"created_at": "2025-11-15",
"difficulty": "L1",
"tags": [
"data extraction",
"search aggregation",
"content submission",
"user interaction"
],
"mcp": [
"playwright"
],
"meta_data": {
"stateType": "video",
"stateContent": null,
"stateUrl": "https://storage.mcpmark.ai/tasks_state/playwright_video/postmill.mp4",
"stateOriginalUrl": "https://github.com/web-arena-x/webarena/tree/main/environment_docker"
}
}
================================================
FILE: tasks/playwright_webarena/easy/reddit/llm_research_summary/verify.py
================================================
import asyncio
import os
import re
import sys
from pathlib import Path
from playwright.async_api import async_playwright, TimeoutError as PlaywrightTimeoutError
BASE_URL = os.getenv("WEBARENA_BASE_URL", "http://localhost:9999").rstrip("/")
USERNAME = "llm_analyst_2024"
PASSWORD = "LLMExpert2024!"
FORUM_SLUG = "MachineLearning"
POST_TITLE = "LLM Research Summary: GPT Discussions Analysis [2024]"
REQUIRED_FIELDS = [
"Total_LLM_Posts",
"Top1_Title",
"Top1_Upvotes",
]
NUMERIC_FIELDS = {"Total_LLM_Posts", "Top1_Upvotes"}
LABEL_PATH = Path(__file__).parent / "label.txt"
def parse_key_value_format(text: str) -> dict:
data = {}
if not text:
return data
for line in text.splitlines():
line = line.strip()
if not line or "|" not in line:
continue
line = re.sub(r"^[-•*]\s*", "", line)
key, value = line.split("|", 1)
data[key.strip()] = value.strip()
return data
def normalize_text(value: str) -> str:
if value is None:
return ""
replacements = {
"\u2019": "'",
"\u2018": "'",
"\u201c": '"',
"\u201d": '"',
}
for src, dst in replacements.items():
value = value.replace(src, dst)
return " ".join(value.split()).strip()
def load_expected_values() -> dict:
if not LABEL_PATH.exists():
return {}
return parse_key_value_format(LABEL_PATH.read_text(encoding="utf-8"))
async def ensure_logged_in(page) -> bool:
print("Step 1: Signing in as llm_analyst_2024...", file=sys.stderr)
await page.goto(f"{BASE_URL}/", wait_until="networkidle")
user_button = page.locator(f'button:has-text("{USERNAME}")')
if await user_button.count():
print("✓ Already logged in", file=sys.stderr)
return True
login_link = page.locator('a:has-text("Log in")')
if not await login_link.count():
print("FAILED: Login link not found", file=sys.stderr)
return False
await login_link.click()
await page.wait_for_load_state("networkidle")
await page.fill('input[name="_username"]', USERNAME)
await page.fill('input[name="_password"]', PASSWORD)
await page.click('button:has-text("Log in")')
await page.wait_for_load_state("networkidle")
if await page.locator(f'button:has-text("{USERNAME}")').count():
print(f"✓ Logged in as {USERNAME}", file=sys.stderr)
return True
print("FAILED: Could not log in with provided credentials", file=sys.stderr)
return False
async def fetch_summary_body(page):
print("Step 2: Opening MachineLearning summary post...", file=sys.stderr)
await page.goto(f"{BASE_URL}/f/{FORUM_SLUG}", wait_until="networkidle")
post_link = page.locator(f'a:has-text("{POST_TITLE}")')
if not await post_link.count():
print(f"FAILED: Submission '{POST_TITLE}' not found", file=sys.stderr)
return None
await post_link.first.click()
await page.wait_for_load_state("networkidle")
selectors = [
".submission__body",
"article",
".post-body",
".RichText",
'[class*="RichText"]',
'div:has-text("Total_LLM_Posts")',
]
for selector in selectors:
locator = page.locator(selector)
if await locator.count():
content = await locator.first.inner_text()
if content:
print(f"✓ Found summary content via selector {selector}", file=sys.stderr)
return content
print("FAILED: Unable to locate submission body", file=sys.stderr)
return None
def validate_fields(extracted: dict, expected: dict) -> bool:
missing = [key for key in REQUIRED_FIELDS if key not in extracted]
if missing:
print(f"FAILED: Missing required keys: {', '.join(missing)}", file=sys.stderr)
return False
errors = []
for key in REQUIRED_FIELDS:
actual = extracted.get(key, "")
expect = expected.get(key, "")
if key in NUMERIC_FIELDS:
try:
actual_val = int(actual)
expect_val = int(expect)
if actual_val != expect_val:
errors.append(f"{key}: expected {expect_val}, found {actual_val}")
except ValueError:
errors.append(f"{key}: '{actual}' is not numeric")
else:
if normalize_text(actual) != normalize_text(expect):
errors.append(f"{key}: expected '{expect}', found '{actual}'")
if errors:
print("FAILED: Summary values do not match expected data:", file=sys.stderr)
for err in errors:
print(f" - {err}", file=sys.stderr)
return False
print("✓ Summary values match expected snapshot", file=sys.stderr)
return True
async def verify() -> bool:
expected = load_expected_values()
if not expected:
print("FAILED: label.txt is missing", file=sys.stderr)
return False
async with async_playwright() as p:
browser = await p.chromium.launch(headless=True)
context = await browser.new_context()
page = await context.new_page()
try:
if not await ensure_logged_in(page):
return False
content = await fetch_summary_body(page)
if not content:
return False
extracted = parse_key_value_format(content)
if not validate_fields(extracted, expected):
return False
print("SUCCESS: LLM research easy task verified", file=sys.stderr)
return True
except PlaywrightTimeoutError as exc:
print(f"FAILED: Timeout occurred - {exc}", file=sys.stderr)
return False
except Exception as exc:
print(f"FAILED: Unexpected error - {exc}", file=sys.stderr)
return False
finally:
await browser.close()
def main():
result = asyncio.run(verify())
sys.exit(0 if result else 1)
if __name__ == "__main__":
main()
================================================
FILE: tasks/playwright_webarena/easy/reddit/movie_reviewer_analysis/description.md
================================================
Provide a lightweight status report on what’s trending in the movies forum so stakeholders can scan it at a glance.
**Task Requirements:**
1. Register and log in with username `movie_reviewer_2024` and password `movie_reviewer_2024`.
2. On the first page of the `movies` forum, count how many post titles contain any four-digit year (e.g., 1984, 2024) and record the total.
3. Still on that page, find the post with the highest upvote count and record its full title as well as the vote and comment counts shown.
4. Publish a post in the same forum titled `Wonderful Movies Analysis: Community Favorites [2024]`. The body must match the four-line Markdown list below—keep the keys, order, and pipe separators exactly as written while replacing each `FILL_VALUE` with your data:
```
- Total_Year_Posts|FILL_VALUE
- Top_Title|FILL_VALUE
- Top_Upvotes|FILL_VALUE
- Top_Comments|FILL_VALUE
```
No multi-page browsing or special threads are required; this easy task captures just the top signals from the first page.
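One plausible way to count the year-bearing titles in step 2 is a simple regex scan; the pattern below is an assumption about what counts as a four-digit year and is not part of the verifier, which only checks the values you post:

```
import re

def count_year_titles(titles: list[str]) -> int:
    # Count titles containing a standalone four-digit number such as 1984 or 2024.
    return sum(bool(re.search(r"\b\d{4}\b", title)) for title in titles)

print(count_year_titles(["Best films of 2024", "Dune review", "Blade Runner"]))  # 1
```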
================================================
FILE: tasks/playwright_webarena/easy/reddit/movie_reviewer_analysis/label.txt
================================================
- Total_Year_Posts|1
- Top_Title|Who will win the Oscar for ACTRESS IN A SUPPORTING ROLE?
- Top_Upvotes|9933
- Top_Comments|23
================================================
FILE: tasks/playwright_webarena/easy/reddit/movie_reviewer_analysis/meta.json
================================================
{
"task_id": "movie_reviewer_analysis_easy",
"task_name": "Movie Reviewer Analysis (Easy)",
"category_id": "reddit",
"category_name": "Reddit",
"description": "Grab the first-page movie signals plus the Rittenhouse poster stats and share them in a concise recap post.",
"author": "Fanqing Meng",
"created_at": "2025-11-15",
"difficulty": "L1",
"tags": [
"user interaction",
"data extraction",
"comparative analysis",
"content submission"
],
"mcp": [
"playwright"
],
"meta_data": {
"stateType": "video",
"stateContent": null,
"stateUrl": "https://storage.mcpmark.ai/tasks_state/playwright_video/postmill.mp4",
"stateOriginalUrl": "https://github.com/web-arena-x/webarena/tree/main/environment_docker"
}
}
================================================
FILE: tasks/playwright_webarena/easy/reddit/movie_reviewer_analysis/verify.py
================================================
import asyncio
import os
import re
import sys
from pathlib import Path
from playwright.async_api import async_playwright, TimeoutError as PlaywrightTimeoutError
BASE_URL = os.getenv("WEBARENA_BASE_URL", "http://localhost:9999").rstrip("/")
USERNAME = "movie_reviewer_2024"
PASSWORD = "movie_reviewer_2024"
FORUM_SLUG = "movies"
POST_TITLE = "Wonderful Movies Analysis: Community Favorites [2024]"
REQUIRED_FIELDS = [
"Total_Year_Posts",
"Top_Title",
"Top_Upvotes",
"Top_Comments",
]
NUMERIC_FIELDS = {
"Total_Year_Posts",
"Top_Upvotes",
"Top_Comments",
}
LABEL_PATH = Path(__file__).parent / "label.txt"
def parse_key_value_format(text: str) -> dict:
data = {}
if not text:
return data
for line in text.splitlines():
line = line.strip()
if not line or "|" not in line:
continue
line = re.sub(r"^[-•*]\s*", "", line)
key, value = line.split("|", 1)
data[key.strip()] = value.strip()
return data
def normalize_text(value: str) -> str:
if value is None:
return ""
replacements = {
"\u2019": "'",
"\u2018": "'",
"\u201c": '"',
"\u201d": '"',
}
for src, dst in replacements.items():
value = value.replace(src, dst)
return " ".join(value.split()).strip()
def load_expected_values() -> dict:
if not LABEL_PATH.exists():
return {}
return parse_key_value_format(LABEL_PATH.read_text(encoding="utf-8"))
async def ensure_logged_in(page) -> bool:
print("Step 1: Authenticating movie_reviewer_2024...", file=sys.stderr)
await page.goto(f"{BASE_URL}/", wait_until="networkidle")
user_button = page.locator(f'button:has-text("{USERNAME}")')
if await user_button.count():
print("✓ Already logged in", file=sys.stderr)
return True
login_link = page.locator('a:has-text("Log in")')
if not await login_link.count():
print("FAILED: Login link not found", file=sys.stderr)
return False
await login_link.click()
await page.wait_for_load_state("networkidle")
await page.fill('input[name="_username"]', USERNAME)
await page.fill('input[name="_password"]', PASSWORD)
await page.click('button:has-text("Log in")')
await page.wait_for_load_state("networkidle")
if await page.locator(f'button:has-text("{USERNAME}")').count():
print(f"✓ Logged in as {USERNAME}", file=sys.stderr)
return True
print("FAILED: Could not log in with provided credentials", file=sys.stderr)
return False
async def fetch_summary_body(page):
print("Step 2: Locating the movies summary post...", file=sys.stderr)
await page.goto(f"{BASE_URL}/f/{FORUM_SLUG}", wait_until="networkidle")
post_link = page.locator(f'a:has-text("{POST_TITLE}")')
if not await post_link.count():
print(f"FAILED: Submission '{POST_TITLE}' not found", file=sys.stderr)
return None
await post_link.first.click()
await page.wait_for_load_state("networkidle")
selectors = [
".submission__body",
"article",
".post-body",
".RichText",
'[class*="RichText"]',
'div:has-text("Total_Year_Posts")',
]
for selector in selectors:
locator = page.locator(selector)
if await locator.count():
content = await locator.first.inner_text()
if content:
print(f"✓ Retrieved summary content via selector {selector}", file=sys.stderr)
return content
print("FAILED: Unable to locate submission body", file=sys.stderr)
return None
def validate_summary(extracted: dict, expected: dict) -> bool:
missing = [key for key in REQUIRED_FIELDS if key not in extracted]
if missing:
print(f"FAILED: Missing required keys: {', '.join(missing)}", file=sys.stderr)
return False
errors = []
for key in REQUIRED_FIELDS:
actual = extracted.get(key, "")
expect = expected.get(key, "")
if key in NUMERIC_FIELDS:
try:
actual_val = int(actual)
expect_val = int(expect)
if actual_val != expect_val:
errors.append(f"{key}: expected {expect_val}, found {actual_val}")
except ValueError:
errors.append(f"{key}: '{actual}' is not numeric")
else:
if normalize_text(actual) != normalize_text(expect):
errors.append(f"{key}: expected '{expect}', found '{actual}'")
if errors:
print("FAILED: Summary values differ from expected snapshot:", file=sys.stderr)
for err in errors:
print(f" - {err}", file=sys.stderr)
return False
print("✓ Summary values match expected data", file=sys.stderr)
return True
async def verify() -> bool:
expected = load_expected_values()
if not expected:
print("FAILED: label.txt is missing", file=sys.stderr)
return False
async with async_playwright() as p:
browser = await p.chromium.launch(headless=True)
context = await browser.new_context()
page = await context.new_page()
try:
if not await ensure_logged_in(page):
return False
content = await fetch_summary_body(page)
if not content:
return False
extracted = parse_key_value_format(content)
if not validate_summary(extracted, expected):
return False
print("SUCCESS: Movie analysis easy task verified", file=sys.stderr)
return True
except PlaywrightTimeoutError as exc:
print(f"FAILED: Timeout occurred - {exc}", file=sys.stderr)
return False
except Exception as exc:
print(f"FAILED: Unexpected error - {exc}", file=sys.stderr)
return False
finally:
await browser.close()
def main():
result = asyncio.run(verify())
sys.exit(0 if result else 1)
if __name__ == "__main__":
main()
================================================
FILE: tasks/playwright_webarena/easy/reddit/nba_statistics_analysis/description.md
================================================
Share a five-line NBA engagement brief so sports moderators can instantly see which thread is dominating.
**Task Requirements:**
1. Register and log in with username `NBA_DataAnalyst_2024` and password `Research#2024!`.
2. In the `sports` forum (or via the site search for “NBA”), locate the NBA-related post with the highest upvote count you can see. Record its exact title, vote total, comment count, and author username.
3. Count how many distinct NBA-titled posts you reviewed for this summary (at minimum, include the post from step 2) and store that total as `Total_NBA_Posts`.
4. Publish a post titled `Statistical Analysis: NBA Content Engagement on This Forum` in the same forum. Its body must consist solely of the Markdown list below; keep the keys/order/pipes untouched while replacing each `FILL_VALUE` with your findings:
```
- Total_NBA_Posts|FILL_VALUE
- Top_Title|FILL_VALUE
- Top_Votes|FILL_VALUE
- Top_Comments|FILL_VALUE
- Top_Author|FILL_VALUE
```
This easy edition just reports the leading NBA thread plus the count of posts you reviewed—no deeper profile checks are necessary.
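For reference, `verify.py` treats `Total_NBA_Posts`, `Top_Votes`, and `Top_Comments` as integers and compares the remaining fields as normalized text. A minimal sketch of that split (the sample values are placeholders):

```
NUMERIC_FIELDS = {"Total_NBA_Posts", "Top_Votes", "Top_Comments"}

def field_matches(key: str, actual: str, expected: str) -> bool:
    if key in NUMERIC_FIELDS:
        try:
            return int(actual) == int(expected)  # "123" and " 123 " both count
        except ValueError:
            return False  # e.g. "about 123" is rejected as non-numeric
    return " ".join(actual.split()) == " ".join(expected.split())

assert field_matches("Top_Votes", " 123 ", "123")
assert not field_matches("Top_Votes", "about 123", "123")
```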
================================================
FILE: tasks/playwright_webarena/easy/reddit/nba_statistics_analysis/label.txt
================================================
- Total_NBA_Posts|20
- Top_Title|Hamby claims [WNBA Champ] Aces 'unprofessional' after trade
- Top_Votes|614
- Top_Comments|170
- Top_Author|Responsible-Lunch815
================================================
FILE: tasks/playwright_webarena/easy/reddit/nba_statistics_analysis/meta.json
================================================
{
"task_id": "nba_statistics_analysis_easy",
"task_name": "NBA Statistics Analysis (Easy)",
"category_id": "reddit",
"category_name": "Reddit",
"description": "Summarize just the three strongest NBA threads and share their vote/comment stats in a short post.",
"author": "Fanqing Meng",
"created_at": "2025-11-15",
"difficulty": "L1",
"tags": [
"user interaction",
"data extraction",
"comparative analysis",
"content submission"
],
"mcp": [
"playwright"
],
"meta_data": {
"stateType": "video",
"stateContent": null,
"stateUrl": "https://storage.mcpmark.ai/tasks_state/playwright_video/postmill.mp4",
"stateOriginalUrl": "https://github.com/web-arena-x/webarena/tree/main/environment_docker"
}
}
================================================
FILE: tasks/playwright_webarena/easy/reddit/nba_statistics_analysis/verify.py
================================================
import asyncio
import os
import re
import sys
from pathlib import Path
from playwright.async_api import async_playwright, TimeoutError as PlaywrightTimeoutError
BASE_URL = os.getenv("WEBARENA_BASE_URL", "http://localhost:9999").rstrip("/")
USERNAME = "NBA_DataAnalyst_2024"
PASSWORD = "Research#2024!"
FORUM_SLUG = "sports"
POST_TITLE = "Statistical Analysis: NBA Content Engagement on This Forum"
REQUIRED_FIELDS = [
"Total_NBA_Posts",
"Top_Title",
"Top_Votes",
"Top_Comments",
"Top_Author",
]
NUMERIC_FIELDS = {
"Total_NBA_Posts",
"Top_Votes",
"Top_Comments",
}
LABEL_PATH = Path(__file__).parent / "label.txt"
def parse_key_value_format(text: str) -> dict:
data = {}
if not text:
return data
for line in text.splitlines():
line = line.strip()
if not line or "|" not in line:
continue
line = re.sub(r"^[-•*]\s*", "", line)
key, value = line.split("|", 1)
data[key.strip()] = value.strip()
return data
def normalize_text(value: str) -> str:
if value is None:
return ""
replacements = {
"\u2019": "'",
"\u2018": "'",
"\u201c": '"',
"\u201d": '"',
}
for src, dst in replacements.items():
value = value.replace(src, dst)
return " ".join(value.split()).strip()
def load_expected_values() -> dict:
if not LABEL_PATH.exists():
return {}
return parse_key_value_format(LABEL_PATH.read_text(encoding="utf-8"))
async def ensure_logged_in(page) -> bool:
print("Step 1: Logging into the sports account...", file=sys.stderr)
await page.goto(f"{BASE_URL}/", wait_until="networkidle")
user_button = page.locator(f'button:has-text("{USERNAME}")')
if await user_button.count():
print("✓ Already logged in", file=sys.stderr)
return True
login_link = page.locator('a:has-text("Log in")')
if not await login_link.count():
print("FAILED: Login link not found", file=sys.stderr)
return False
await login_link.click()
await page.wait_for_load_state("networkidle")
await page.fill('input[name="_username"]', USERNAME)
await page.fill('input[name="_password"]', PASSWORD)
await page.click('button:has-text("Log in")')
await page.wait_for_load_state("networkidle")
if await page.locator(f'button:has-text("{USERNAME}")').count():
print(f"✓ Logged in as {USERNAME}", file=sys.stderr)
return True
print("FAILED: Could not log in with provided credentials", file=sys.stderr)
return False
async def fetch_summary_body(page):
print("Step 2: Opening the NBA engagement summary post...", file=sys.stderr)
await page.goto(f"{BASE_URL}/f/{FORUM_SLUG}", wait_until="networkidle")
post_link = page.locator(f'a:has-text("{POST_TITLE}")')
if not await post_link.count():
print(f"FAILED: Submission '{POST_TITLE}' not found", file=sys.stderr)
return None
await post_link.first.click()
await page.wait_for_load_state("networkidle")
selectors = [
".submission__body",
"article",
".post-body",
".RichText",
'[class*="RichText"]',
'div:has-text("Total_NBA_Posts")',
]
for selector in selectors:
locator = page.locator(selector)
if await locator.count():
content = await locator.first.inner_text()
if content:
print(f"✓ Retrieved summary body via selector {selector}", file=sys.stderr)
return content
print("FAILED: Unable to locate submission body", file=sys.stderr)
return None
def validate_summary(extracted: dict, expected: dict) -> bool:
missing = [key for key in REQUIRED_FIELDS if key not in extracted]
if missing:
print(f"FAILED: Missing required keys: {', '.join(missing)}", file=sys.stderr)
return False
errors = []
for key in REQUIRED_FIELDS:
actual = extracted.get(key, "")
expect = expected.get(key, "")
if key in NUMERIC_FIELDS:
try:
actual_val = int(actual)
expect_val = int(expect)
if actual_val != expect_val:
errors.append(f"{key}: expected {expect_val}, found {actual_val}")
except ValueError:
errors.append(f"{key}: '{actual}' is not numeric")
else:
if normalize_text(actual) != normalize_text(expect):
errors.append(f"{key}: expected '{expect}', found '{actual}'")
if errors:
print("FAILED: Summary data does not match expected values:", file=sys.stderr)
for err in errors:
print(f" - {err}", file=sys.stderr)
return False
print("✓ Summary fields align with expected snapshot", file=sys.stderr)
return True
async def verify() -> bool:
expected = load_expected_values()
if not expected:
print("FAILED: label.txt is missing", file=sys.stderr)
return False
async with async_playwright() as p:
browser = await p.chromium.launch(headless=True)
context = await browser.new_context()
page = await context.new_page()
try:
if not await ensure_logged_in(page):
return False
content = await fetch_summary_body(page)
if not content:
return False
extracted = parse_key_value_format(content)
if not validate_summary(extracted, expected):
return False
print("SUCCESS: NBA statistics easy task verified", file=sys.stderr)
return True
except PlaywrightTimeoutError as exc:
print(f"FAILED: Timeout occurred - {exc}", file=sys.stderr)
return False
except Exception as exc:
print(f"FAILED: Unexpected error - {exc}", file=sys.stderr)
return False
finally:
await browser.close()
def main():
result = asyncio.run(verify())
sys.exit(0 if result else 1)
if __name__ == "__main__":
main()
================================================
FILE: tasks/playwright_webarena/easy/reddit/routine_tracker_forum/description.md
================================================
Reshare the most helpful LifeProTips calendar advice so it appears as a fresh routine post for other readers.
**Task Requirements:**
1. Register and log in with username `RoutineTracker2025` and password `DailyRoutine123!`.
2. Find the LifeProTips thread titled `LPT: Use your calendar as your to-do list. Assigning dedicated time to tasks increases the likelyhood of you acting upon it.` and identify the comment with the highest upvote count. Copy its full text verbatim.
3. In the `LifeProTips` forum, publish a post titled `My 5-Step Morning Routine That Increased My Productivity by 200%` whose body is exactly the comment text you copied (no additional words or formatting).
4. Confirm that your new post appears in the LifeProTips listing so community members can immediately see it.
Only these three actions—copy, repost, verify—are required for the easy version; no extra upvotes or settings changes are necessary.
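For reference, the verifier opens the new post and checks that the copied comment appears verbatim inside the rendered body text (a substring check rather than an exact whole-body match). A minimal sketch:

```
def body_contains_comment(body_text: str, expected_comment: str) -> bool:
    # The rendered page may include the title and other chrome around the body,
    # so the check only requires the copied comment to appear verbatim within it.
    return expected_comment in body_text

assert body_contains_comment("My routine\n\nGet up early and plan the day.", "Get up early and plan the day.")
```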
================================================
FILE: tasks/playwright_webarena/easy/reddit/routine_tracker_forum/meta.json
================================================
{
"task_id": "routine_tracker_forum_easy",
"task_name": "Routine Tracker Forum (Easy)",
"category_id": "reddit",
"category_name": "Reddit",
"description": "Repost the highest-rated LifeProTips calendar advice under a new routine-tracking thread.",
"author": "Fanqing Meng",
"created_at": "2025-11-15",
"difficulty": "L1",
"tags": [
"user interaction",
"content submission"
],
"mcp": [
"playwright"
],
"meta_data": {
"stateType": "video",
"stateContent": null,
"stateUrl": "https://storage.mcpmark.ai/tasks_state/playwright_video/postmill.mp4",
"stateOriginalUrl": "https://github.com/web-arena-x/webarena/tree/main/environment_docker"
}
}
================================================
FILE: tasks/playwright_webarena/easy/reddit/routine_tracker_forum/verify.py
================================================
import asyncio
import os
import sys
from playwright.async_api import async_playwright, TimeoutError as PlaywrightTimeoutError
BASE_URL = os.getenv("WEBARENA_BASE_URL", "http://localhost:9999").rstrip("/")
USERNAME = "RoutineTracker2025"
PASSWORD = "DailyRoutine123!"
FORUM_SLUG = "LifeProTips"
POST_TITLE = "My 5-Step Morning Routine That Increased My Productivity by 200%"
EXPECTED_BODY = (
"As a college student, having a visible reminder of the assignments I have and when they are due is super helpful for me. "
"It also just feels good to erase them from the board once they are completed."
)
async def ensure_logged_in(page) -> bool:
print("Step 1: Logging in before verification...", file=sys.stderr)
await page.goto(f"{BASE_URL}/", wait_until="networkidle")
user_button = page.locator(f'button:has-text("{USERNAME}")')
if await user_button.count():
print("✓ Already logged in", file=sys.stderr)
return True
login_link = page.locator('a:has-text("Log in")')
if not await login_link.count():
print("FAILED: Login link not found", file=sys.stderr)
return False
await login_link.click()
await page.wait_for_load_state("networkidle")
await page.fill('input[name="_username"]', USERNAME)
await page.fill('input[name="_password"]', PASSWORD)
await page.click('button:has-text("Log in")')
await page.wait_for_load_state("networkidle")
if await page.locator(f'button:has-text("{USERNAME}")').count():
print(f"✓ Logged in as {USERNAME}", file=sys.stderr)
return True
print("FAILED: Could not log in with provided credentials", file=sys.stderr)
return False
async def verify_post_body(page) -> bool:
print("Step 2: Validating reposted comment content...", file=sys.stderr)
await page.goto(f"{BASE_URL}/f/{FORUM_SLUG}", wait_until="networkidle")
post_link = page.locator(f'a:has-text("{POST_TITLE}")')
if not await post_link.count():
print(f"FAILED: Post '{POST_TITLE}' not found in LifeProTips", file=sys.stderr)
return False
await post_link.first.click()
await page.wait_for_load_state("networkidle")
article = page.locator("article")
if not await article.count():
print("FAILED: Unable to read post body", file=sys.stderr)
return False
body_text = await article.first.inner_text()
if EXPECTED_BODY not in body_text:
print("FAILED: Post body does not match the copied comment text", file=sys.stderr)
return False
print("✓ Post body matches the expected LifeProTips comment", file=sys.stderr)
return True
async def verify_listing_presence(page) -> bool:
print("Step 3: Confirming the post appears in the forum listing...", file=sys.stderr)
await page.goto(f"{BASE_URL}/f/{FORUM_SLUG}", wait_until="networkidle")
post_link = page.locator(f'a:has-text("{POST_TITLE}")')
if await post_link.count():
print("✓ Post is visible in the LifeProTips feed", file=sys.stderr)
return True
print("FAILED: Post missing from forum listing", file=sys.stderr)
return False
async def verify() -> bool:
async with async_playwright() as p:
browser = await p.chromium.launch(headless=True)
context = await browser.new_context()
page = await context.new_page()
try:
if not await ensure_logged_in(page):
return False
if not await verify_post_body(page):
return False
if not await verify_listing_presence(page):
return False
print("SUCCESS: Routine tracker easy task verified", file=sys.stderr)
return True
except PlaywrightTimeoutError as exc:
print(f"FAILED: Timeout occurred - {exc}", file=sys.stderr)
return False
except Exception as exc:
print(f"FAILED: Unexpected error - {exc}", file=sys.stderr)
return False
finally:
await browser.close()
def main():
result = asyncio.run(verify())
sys.exit(0 if result else 1)
if __name__ == "__main__":
main()
================================================
FILE: tasks/playwright_webarena/easy/shopping_admin/fitness_promotion_strategy/description.md
================================================
Stick to the first three analytical steps from the original workflow so the easy version only inventories bestseller and promo data.
**Task Requirements**
1. If you need to log in, log in with username 'admin' and password 'admin1234'.
2. **Dashboard stop**: read the first three rows in **Bestsellers** (name, price, quantity) exactly as shown, note the Revenue KPI amount, and look at the **Top Search Terms** widget—if any of those three product names appears there, record it as `term:uses`, otherwise output `No:0`.
3. **Catalog → Products stop**: search each of the same three bestseller names one at a time and copy their SKU, Qty (inventory column), and Status (Enabled/Disabled) from the grid.
4. **Marketing → Promotions → Cart Price Rules stop**: set Status = Active, count how many rules are shown, and locate the rule that applies a percentage discount so you can report `rule name:percentage`.
Output everything using the reduced template below:
```
Bestseller1|name:price:quantity:sku:inventory:status
Bestseller2|name:price:quantity:sku:inventory:status
Bestseller3|name:price:quantity:sku:inventory:status
TotalRevenue|amount
BestsellerInSearch|term:count
PercentageDiscountRule|name:percentage
ActiveRulesCount|count
```
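For reference, the accompanying `verify.py` splits each `Bestseller` value on `:` and expects exactly six parts (name, price, quantity, sku, inventory, status); the price is compared after stripping `$` and `,`, and the inventory is compared numerically. A minimal sketch of that parsing (the sample line is a placeholder, not the expected answer):

```
def split_bestseller(value: str) -> dict:
    # Expected shape: name:price:quantity:sku:inventory:status
    parts = value.split(":")
    if len(parts) != 6:
        raise ValueError("expected 6 colon-separated parts")
    name, price, quantity, sku, inventory, status = parts
    return {
        "name": name,
        "price": price.replace("$", "").replace(",", ""),  # "$27.00" -> "27.00"
        "quantity": quantity,
        "sku": sku,
        "inventory": float(inventory.replace(",", "")),     # tolerant of "100.0000"
        "status": status,
    }

print(split_bestseller("Sample Ball 55 cm:$23.00:5:24-WG000:100:Enabled"))
```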
================================================
FILE: tasks/playwright_webarena/easy/shopping_admin/fitness_promotion_strategy/label.txt
================================================
Bestseller1|Sprite Stasis Ball 65 cm:$27.00:6:24-WG082-blue:100:Enabled
Bestseller2|Quest Lumaflex™ Band:$19.00:6:24-UG01:100:Enabled
Bestseller3|Sprite Yoga Strap 6 foot:$14.00:6:24-WG085:100:Enabled
TotalRevenue|$0.00
BestsellerInSearch|No:0
PercentageDiscountRule|20% OFF Ever $200-plus purchase!*:20%
ActiveRulesCount|4
================================================
FILE: tasks/playwright_webarena/easy/shopping_admin/fitness_promotion_strategy/meta.json
================================================
{
"task_id": "fitness_promotion_strategy_easy",
"task_name": "Fitness Promotion Strategy (Easy)",
"category_id": "shopping_admin",
"category_name": "Shopping Admin",
"description": "Capture the three dashboard bestsellers, confirm their catalog details, and snapshot the related promo and customer metrics needed for a quick campaign brief.",
"author": "Fanqing Meng",
"created_at": "2025-11-15",
"difficulty": "L1",
"tags": [
"data extraction",
"comparative analysis",
"inventory management",
"content submission"
],
"mcp": [
"playwright"
],
"meta_data": {
"stateType": "video",
"stateContent": null,
"stateUrl": "https://storage.mcpmark.ai/tasks_state/playwright_video/magento-admin.mp4",
"stateOriginalUrl": "https://github.com/web-arena-x/webarena/tree/main/environment_docker"
}
}
================================================
FILE: tasks/playwright_webarena/easy/shopping_admin/fitness_promotion_strategy/verify.py
================================================
import asyncio
import sys
import re
import os
import json
from pathlib import Path
def get_model_response():
"""
Get the model's response from the MCP_MESSAGES environment variable.
Returns the last assistant message text.
"""
messages_path = os.getenv("MCP_MESSAGES")
print(f"MCP_MESSAGES: {messages_path}")
if not messages_path:
print("Warning: MCP_MESSAGES environment variable not set", file=sys.stderr)
return None
try:
with open(messages_path, 'r') as f:
messages = json.load(f)
# Find the last assistant message
for message in reversed(messages):
if message.get('role') == 'assistant' and message.get('status') == 'completed':
content = message.get('content', [])
for item in content:
if item.get('type') == 'output_text':
return item.get('text', '')
print("Warning: No assistant response found in messages", file=sys.stderr)
return None
except Exception as e:
print(f"Error reading messages file: {str(e)}", file=sys.stderr)
return None
def parse_answer_format(text):
"""
Parse the ... format from the agent's output.
Returns a dictionary with the parsed values.
"""
if not text:
return None
# Look for ... pattern
match = re.search(r'(.*?)', text, re.IGNORECASE | re.DOTALL)
if not match:
return None
answer_content = match.group(1).strip()
# Parse each line
result = {}
lines = answer_content.split('\n')
# Skip the check for exact number of lines - just parse what we have
# if len(lines) != 13:
# print(f"Error: Expected 13 lines in answer, got {len(lines)}", file=sys.stderr)
# return None
for line in lines:
if '|' in line:
key, value = line.split('|', 1)
result[key.strip()] = value.strip()
return result
def load_expected_answer(label_path):
"""
Load the expected answer from label.txt file.
Returns a dictionary with the expected values.
"""
try:
with open(label_path, 'r') as f:
lines = f.read().strip().split('\n')
expected = {}
for line in lines:
if '|' in line:
key, value = line.split('|', 1)
expected[key.strip()] = value.strip()
return expected
except Exception as e:
print(f"Error reading label file: {str(e)}", file=sys.stderr)
return None
def compare_answers(model_answer, expected_answer):
"""
Compare the model's answer with the expected answer.
Returns True if all key information matches, False otherwise.
"""
if not model_answer or not expected_answer:
return False
# Check each expected key
mismatches = []
for key, expected_value in expected_answer.items():
model_value = model_answer.get(key, '')
# Special handling for different types of values
if key in ['Bestseller1', 'Bestseller2', 'Bestseller3']:
# Check if all parts match (name:price:quantity:sku:inventory:status)
if ':' in expected_value and ':' in model_value:
expected_parts = expected_value.split(':')
model_parts = model_value.split(':')
if len(expected_parts) == 6 and len(model_parts) == 6:
# Compare each part
for i, (exp, mod) in enumerate(zip(expected_parts, model_parts)):
if i == 1: # Price field
exp_clean = exp.replace('$', '').replace(',', '')
mod_clean = mod.replace('$', '').replace(',', '')
if exp_clean != mod_clean:
mismatches.append(f"{key} price: expected '{exp}', got '{mod}'")
elif i == 4: # Inventory field (may have decimal places)
exp_float = float(exp.replace(',', ''))
mod_float = float(mod.replace(',', ''))
if abs(exp_float - mod_float) > 0.0001:
mismatches.append(f"{key} inventory: expected '{exp}', got '{mod}'")
else:
if exp.lower() != mod.lower():
mismatches.append(f"{key} part {i}: expected '{exp}', got '{mod}'")
else:
mismatches.append(f"{key}: format mismatch - expected '{expected_value}', got '{model_value}'")
else:
if expected_value != model_value:
mismatches.append(f"{key}: expected '{expected_value}', got '{model_value}'")
elif key == 'LowestInventoryProduct':
# Check product name and inventory
if ':' in expected_value and ':' in model_value:
expected_name, expected_inv = expected_value.rsplit(':', 1)
model_name, model_inv = model_value.rsplit(':', 1)
if expected_name.lower() != model_name.lower():
mismatches.append(f"{key} name: expected '{expected_name}', got '{model_name}'")
exp_float = float(expected_inv.replace(',', ''))
mod_float = float(model_inv.replace(',', ''))
if abs(exp_float - mod_float) > 0.0001:
mismatches.append(f"{key} inventory: expected '{expected_inv}', got '{model_inv}'")
else:
if expected_value != model_value:
mismatches.append(f"{key}: expected '{expected_value}', got '{model_value}'")
elif key in ['TotalRevenue', 'MinimumPurchaseRule']:
# For price/amount fields, normalize format
expected_clean = expected_value.replace('$', '').replace(',', '')
model_clean = model_value.replace('$', '').replace(',', '')
if expected_clean != model_clean:
mismatches.append(f"{key}: expected '{expected_value}', got '{model_value}'")
elif key == 'BestsellerInSearch':
# Check search term and count
if expected_value.lower() != model_value.lower():
mismatches.append(f"{key}: expected '{expected_value}', got '{model_value}'")
elif key == 'PercentageDiscountRule':
# Check rule name and percentage
if ':' in expected_value and ':' in model_value:
expected_name, expected_pct = expected_value.rsplit(':', 1)
model_name, model_pct = model_value.rsplit(':', 1)
if expected_name != model_name:
mismatches.append(f"{key} name: expected '{expected_name}', got '{model_name}'")
# Normalize percentage (20% vs 20 vs 0.20)
exp_pct_clean = expected_pct.replace('%', '').strip()
mod_pct_clean = model_pct.replace('%', '').strip()
if exp_pct_clean != mod_pct_clean:
mismatches.append(f"{key} percentage: expected '{expected_pct}', got '{model_pct}'")
else:
if expected_value != model_value:
mismatches.append(f"{key}: expected '{expected_value}', got '{model_value}'")
elif key == 'TopCustomer':
# Check name:email:group
if ':' in expected_value and ':' in model_value:
expected_parts = expected_value.split(':')
model_parts = model_value.split(':')
if len(expected_parts) == 3 and len(model_parts) == 3:
exp_name, exp_email, exp_group = expected_parts
mod_name, mod_email, mod_group = model_parts
if exp_name != mod_name:
mismatches.append(f"{key} name: expected '{exp_name}', got '{mod_name}'")
if exp_email.lower() != mod_email.lower():
mismatches.append(f"{key} email: expected '{exp_email}', got '{mod_email}'")
if exp_group.lower() != mod_group.lower():
mismatches.append(f"{key} group: expected '{exp_group}', got '{mod_group}'")
else:
mismatches.append(f"{key}: format mismatch - expected '{expected_value}', got '{model_value}'")
else:
if expected_value != model_value:
mismatches.append(f"{key}: expected '{expected_value}', got '{model_value}'")
elif key == 'MostRecentOrderDate':
# Date format may vary, do flexible comparison
if expected_value.lower() == 'none' and model_value.lower() == 'none':
continue
elif expected_value != model_value:
# Could add more flexible date parsing here if needed
mismatches.append(f"{key}: expected '{expected_value}', got '{model_value}'")
else:
# Exact match for other fields (counts, etc.)
if str(model_value) != str(expected_value):
mismatches.append(f"{key}: expected '{expected_value}', got '{model_value}'")
if mismatches:
print("\n=== Answer Comparison Mismatches ===", file=sys.stderr)
for mismatch in mismatches:
print(f"✗ {mismatch}", file=sys.stderr)
return False
print("\n=== Answer Comparison ===", file=sys.stderr)
print("✓ All key information matches the expected answer", file=sys.stderr)
return True
async def verify() -> bool:
"""
Verifies that the bestseller analysis and promotion task has been completed correctly.
First checks the model's answer against the expected label,
then optionally verifies the actual state in the Magento Admin.
"""
# Get the label file path
label_path = Path(__file__).parent / "label.txt"
# Load expected answer
expected_answer = load_expected_answer(label_path)
if not expected_answer:
print("Error: Could not load expected answer from label.txt", file=sys.stderr)
return False
# Get model's response from MCP_MESSAGES
model_response = get_model_response()
if model_response:
print("Found model response, parsing answer format...", file=sys.stderr)
model_answer = parse_answer_format(model_response)
if model_answer:
print("\n=== Model Answer Parsed ===", file=sys.stderr)
for key, value in model_answer.items():
print(f"{key}: {value}", file=sys.stderr)
# Compare answers
answer_match = compare_answers(model_answer, expected_answer)
if not answer_match:
print("\nModel answer does not match expected answer", file=sys.stderr)
return False
print("\n✓ Model answer matches expected answer", file=sys.stderr)
return True
else:
print("Warning: Could not parse answer format from model response", file=sys.stderr)
return False
else:
print("No model response found", file=sys.stderr)
return False
def main():
"""
Executes the verification process and exits with a status code.
"""
result = asyncio.run(verify())
sys.exit(0 if result else 1)
if __name__ == "__main__":
main()
================================================
FILE: tasks/playwright_webarena/easy/shopping_admin/ny_expansion_analysis/description.md
================================================
Keep only the first three investigative steps so the easy task focuses on dashboard + tax + order-status insights.
**Task Requirements**
1. If login is required, log in with username 'admin' and password 'admin1234'. On the **Dashboard**, record the Lifetime Sales amount, identify the cheapest product in the **Bestsellers** table (note its name, price, and quantity), and check whether that same product appears anywhere in **Last Orders** (output the customer name if yes, otherwise `No`).
2. Go to **Stores → Taxes → Tax Zones and Rates**. Capture the exact rates for New York and California, specify which state is higher, and count how many distinct U.S. states have entries in the grid.
3. Still in **Stores**, open **Settings → Order Status**, filter “Visible On Storefront = Yes”, and confirm whether a status with code `processing` exists and if it’s flagged as a default status.
Report just these metrics in the reduced answer format:
```
Lifetime_Sales_Amount|amount
Cheap_Bestseller_Name|name
Second_Bestseller_Price|price
Second_Bestseller_Quantity|quantity
Product_In_Last_Orders|yes_or_no_or_customer
NY_Tax_Rate|rate
CA_Tax_Rate|rate
Higher_Tax_State|state
Total_States_With_Tax|count
Processing_Visible_Storefront|Yes_or_No
Processing_Default_Status|Yes_or_No
```
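**Example Output** (illustrative placeholders only; the keys mirror the format above, and the values are not real figures):
```
Lifetime_Sales_Amount|$X.XX
Cheap_Bestseller_Name|Product Name Here
Second_Bestseller_Price|$XX.XX
Second_Bestseller_Quantity|X
Product_In_Last_Orders|Customer Name or No
NY_Tax_Rate|X.XXXX
CA_Tax_Rate|X.XXXX
Higher_Tax_State|NY or CA
Total_States_With_Tax|X
Processing_Visible_Storefront|Yes or No
Processing_Default_Status|Yes or No
```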
================================================
FILE: tasks/playwright_webarena/easy/shopping_admin/ny_expansion_analysis/label.txt
================================================
Lifetime_Sales_Amount|$0.00
Cheap_Bestseller_Name|Sprite Yoga Strap 6 foot
Second_Bestseller_Price|$14.00
Second_Bestseller_Quantity|6
Product_In_Last_Orders|No
NY_Tax_Rate|8.3750
CA_Tax_Rate|8.2500
Higher_Tax_State|NY
Total_States_With_Tax|2
Processing_Visible_Storefront|Yes
Processing_Default_Status|Yes
================================================
FILE: tasks/playwright_webarena/easy/shopping_admin/ny_expansion_analysis/meta.json
================================================
{
"task_id": "ny_expansion_analysis_easy",
"task_name": "NY Expansion Analysis (Easy)",
"category_id": "shopping_admin",
"category_name": "Shopping Admin",
"description": "Capture just the dashboard, tax, order-status, store, and inventory facts required to judge if New York can launch without heavy configuration work.",
"author": "Fanqing Meng",
"created_at": "2025-11-15",
"difficulty": "L1",
"tags": [
"data extraction",
"comparative analysis",
"content submission"
],
"mcp": [
"playwright"
],
"meta_data": {
"stateType": "video",
"stateContent": null,
"stateUrl": "https://storage.mcpmark.ai/tasks_state/playwright_video/magento-admin.mp4",
"stateOriginalUrl": "https://github.com/web-arena-x/webarena/tree/main/environment_docker"
}
}
================================================
FILE: tasks/playwright_webarena/easy/shopping_admin/ny_expansion_analysis/verify.py
================================================
import asyncio
import sys
import re
import os
import json
from pathlib import Path
def get_model_response():
"""
Get the model's response from the MCP_MESSAGES environment variable.
Returns the last assistant message text.
"""
messages_path = os.getenv("MCP_MESSAGES")
print(f"MCP_MESSAGES: {messages_path}")
if not messages_path:
print("ERROR: MCP_MESSAGES environment variable not set", file=sys.stderr)
return None
# Check if file exists
if not Path(messages_path).exists():
print(f"ERROR: Messages file not found at path: {messages_path}", file=sys.stderr)
return None
try:
with open(messages_path, 'r') as f:
content = f.read()
# Check if file is empty
if not content or content.strip() == '""':
print("ERROR: Messages file is empty or contains only empty string", file=sys.stderr)
return None
messages = json.loads(content)
# Check if messages is a list
if not isinstance(messages, list):
print(f"ERROR: Messages file should contain a list, got {type(messages).__name__}", file=sys.stderr)
return None
# Find the last assistant message
for message in reversed(messages):
if message.get('role') == 'assistant' and message.get('status') == 'completed':
content = message.get('content', [])
if not content:
print("WARNING: Assistant message has empty content", file=sys.stderr)
continue
for item in content:
if item.get('type') == 'output_text':
text = item.get('text', '')
if not text:
print("WARNING: Output text is empty", file=sys.stderr)
continue
return text
print("ERROR: No assistant response with output_text found in messages", file=sys.stderr)
return None
except json.JSONDecodeError as e:
print(f"ERROR: Invalid JSON in messages file: {str(e)}", file=sys.stderr)
return None
except Exception as e:
print(f"ERROR: Unexpected error reading messages file: {str(e)}", file=sys.stderr)
return None
def parse_answer_format(text):
"""
    Parse the <answer>...</answer> format from the agent's output.
Returns a dictionary with the parsed values.
"""
if not text:
print("ERROR: No text provided to parse", file=sys.stderr)
return None
    # Look for <answer>...</answer> pattern
    match = re.search(r'<answer>(.*?)</answer>', text, re.IGNORECASE | re.DOTALL)
if not match:
print("ERROR: No tags found in the response", file=sys.stderr)
print(f" Response preview: {text[:200]}...", file=sys.stderr)
return None
answer_content = match.group(1).strip()
if not answer_content:
print("ERROR: Empty content between tags", file=sys.stderr)
return None
# Parse each line
result = {}
lines = answer_content.split('\n')
# Expected keys that should be present
expected_keys = [
'Lifetime_Sales_Amount', 'Cheap_Bestseller_Name', 'Second_Bestseller_Price',
'Second_Bestseller_Quantity', 'Product_In_Last_Orders', 'NY_Tax_Rate',
'CA_Tax_Rate', 'Higher_Tax_State', 'Total_States_With_Tax',
'Processing_Visible_Storefront', 'Processing_Default_Status'
]
parsed_keys = []
for line in lines:
line = line.strip()
if not line:
continue
if '|' not in line:
print(f"ERROR: Line missing pipe separator '|': {line}", file=sys.stderr)
continue
parts = line.split('|', 1)
if len(parts) != 2:
print(f"ERROR: Invalid line format: {line}", file=sys.stderr)
continue
key, value = parts
key = key.strip()
value = value.strip()
if not key:
print(f"ERROR: Empty key in line: {line}", file=sys.stderr)
continue
result[key] = value
parsed_keys.append(key)
# Check for missing expected keys
missing_keys = set(expected_keys) - set(parsed_keys)
if missing_keys:
print(f"ERROR: Missing expected keys: {', '.join(sorted(missing_keys))}", file=sys.stderr)
# Check for unexpected keys
unexpected_keys = set(parsed_keys) - set(expected_keys)
if unexpected_keys:
print(f"WARNING: Unexpected keys found: {', '.join(sorted(unexpected_keys))}", file=sys.stderr)
if not result:
print("ERROR: No valid key-value pairs parsed from answer", file=sys.stderr)
return None
return result
def load_expected_answer(label_path):
"""
Load the expected answer from label.txt file.
Returns a dictionary with the expected values.
"""
try:
with open(label_path, 'r') as f:
lines = f.read().strip().split('\n')
expected = {}
for line in lines:
if '|' in line:
key, value = line.split('|', 1)
expected[key.strip()] = value.strip()
return expected
except Exception as e:
print(f"Error reading label file: {str(e)}", file=sys.stderr)
return None
def compare_answers(model_answer, expected_answer):
"""
Compare the model's answer with the expected answer.
Returns True if all key information matches, False otherwise.
"""
if not model_answer or not expected_answer:
return False
# Check each expected key
mismatches = []
for key, expected_value in expected_answer.items():
model_value = model_answer.get(key, '')
# Special handling for different types of values
if key in ['Lifetime_Sales_Amount', 'Second_Bestseller_Price', 'Dashboard_Revenue']:
# For price/amount fields, normalize format
expected_clean = expected_value.replace('$', '').replace(',', '')
model_clean = model_value.replace('$', '').replace(',', '')
if expected_clean != model_clean:
mismatches.append(f"{key}: expected '{expected_value}', got '{model_value}'")
elif key in ['NY_Tax_Rate', 'CA_Tax_Rate']:
# Tax rates - allow different decimal formats
expected_clean = expected_value.replace('%', '').strip()
model_clean = model_value.replace('%', '').strip()
# Convert to float for comparison
try:
if float(expected_clean) != float(model_clean):
mismatches.append(f"{key}: expected '{expected_value}', got '{model_value}'")
except ValueError:
if expected_clean != model_clean:
mismatches.append(f"{key}: expected '{expected_value}', got '{model_value}'")
elif key in ['Product_In_Last_Orders', 'Processing_Visible_Storefront', 'Processing_Default_Status']:
# Yes/No fields - case insensitive
if model_value.lower() != expected_value.lower():
mismatches.append(f"{key}: expected '{expected_value}', got '{model_value}'")
elif key == 'Empty_Rows_Yes_Effect':
# Allow flexible descriptions for this field
# Just check if model provided some reasonable description
if not model_value or len(model_value) < 5:
mismatches.append(f"{key}: expected meaningful description, got '{model_value}'")
elif key == 'Order_Status_Options':
# Check if main options are mentioned
expected_options = set(opt.strip() for opt in expected_value.split(','))
model_options = set(opt.strip() for opt in model_value.split(','))
if expected_options != model_options:
mismatches.append(f"{key}: expected '{expected_value}', got '{model_value}'")
elif key == 'Chart_Disabled_Message':
# Allow some flexibility in message text
# Check for key words
if 'disabled' not in model_value.lower() and 'enable' not in model_value.lower():
mismatches.append(f"{key}: expected message about chart being disabled, got '{model_value}'")
elif key == 'Default_Source_State':
# Handle 'None' or empty state
expected_normalized = expected_value.lower() if expected_value.lower() != 'none' else ''
model_normalized = model_value.lower() if model_value.lower() != 'none' else ''
if expected_normalized != model_normalized:
mismatches.append(f"{key}: expected '{expected_value}', got '{model_value}'")
else:
# Exact match for other fields
if model_value != expected_value:
mismatches.append(f"{key}: expected '{expected_value}', got '{model_value}'")
if mismatches:
print("\n=== Answer Comparison Mismatches ===", file=sys.stderr)
for mismatch in mismatches:
print(f"✗ {mismatch}", file=sys.stderr)
return False
print("\n=== Answer Comparison ===", file=sys.stderr)
print("✓ All key information matches the expected answer", file=sys.stderr)
return True
async def verify() -> bool:
"""
Verifies that the NY expansion analysis task has been completed correctly.
First checks the model's answer against the expected label,
then optionally verifies the actual state in the Magento Admin.
"""
print("\n=== Starting Verification ===", file=sys.stderr)
# Get the label file path
label_path = Path(__file__).parent / "label.txt"
# Load expected answer
print("Loading expected answer from label.txt...", file=sys.stderr)
expected_answer = load_expected_answer(label_path)
if not expected_answer:
print("FATAL ERROR: Could not load expected answer from label.txt", file=sys.stderr)
return False
print(f"Expected answer loaded with {len(expected_answer)} keys", file=sys.stderr)
# Get model's response from MCP_MESSAGES
print("\nReading model response from MCP_MESSAGES...", file=sys.stderr)
model_response = get_model_response()
if not model_response:
print("FATAL ERROR: No valid model response found", file=sys.stderr)
return False
print(f"Model response found (length: {len(model_response)} chars)", file=sys.stderr)
print("\nParsing answer format from model response...", file=sys.stderr)
model_answer = parse_answer_format(model_response)
if not model_answer:
print("FATAL ERROR: Could not parse answer format from model response", file=sys.stderr)
return False
print(f"\n=== Model Answer Parsed Successfully ===", file=sys.stderr)
print(f"Parsed {len(model_answer)} key-value pairs", file=sys.stderr)
for key, value in model_answer.items():
print(f" {key}: {value}", file=sys.stderr)
# Compare answers
print("\n=== Comparing Model Answer with Expected Answer ===", file=sys.stderr)
answer_match = compare_answers(model_answer, expected_answer)
if not answer_match:
print("\nFATAL ERROR: Model answer does not match expected answer", file=sys.stderr)
print("Verification FAILED", file=sys.stderr)
return False
print("\n✓ Model answer matches expected answer", file=sys.stderr)
print("Verification PASSED", file=sys.stderr)
return True
def main():
"""
Executes the verification process and exits with a status code.
"""
result = asyncio.run(verify())
sys.exit(0 if result else 1)
if __name__ == "__main__":
main()
================================================
FILE: tasks/playwright_webarena/easy/shopping_admin/products_sales_analysis/description.md
================================================
Only keep the first few catalog and dashboard checks plus the high-level orders snapshot.
**Task Requirements**
1. If login is required, log in with username 'admin' and password 'admin1234'.
2. **Catalog → Products**: search for product names containing `Yoga` and capture the records-found count; reset filters and look up SKU `WH11` to copy its exact price; reset again and set Quantity (From/To) = `0.0000` to count all zero-quantity products.
3. **Dashboard**: in the Bestsellers table sort by price ascending—record the lowest-priced row as `name:quantity`, then locate `Quest Lumaflex™ Band` and note its quantity, and read the Revenue KPI amount.
4. **Sales → Orders**: filter Status = Pending to count those orders, then search for Grace Nguyen, switch Status = Complete, sort Grand Total descending, and record the Order # of the most expensive completed order.
Return just these metrics:
```
YogaProducts|count
WH11Price|price
ZeroQuantityProducts|count
LowestProduct|name:quantity
QuestLumaflexQuantity|quantity
DashboardRevenue|amount
PendingOrders|count
GraceNguyenOrderID|orderid
```
**Example Output:**
```
YogaProducts|XX
WH11Price|$XX.XX
ZeroQuantityProducts|XX
LowestProduct|Product Name Here:XX
QuestLumaflexQuantity|XX
DashboardRevenue|$XX.XX
PendingOrders|X
GraceNguyenOrderID|00000XXXX
```
================================================
FILE: tasks/playwright_webarena/easy/shopping_admin/products_sales_analysis/label.txt
================================================
YogaProducts|171
WH11Price|$54.00
ZeroQuantityProducts|150
LowestProduct|Sprite Stasis Ball 55 cm foot:5
QuestLumaflexQuantity|6
DashboardRevenue|$0.00
PendingOrders|10
GraceNguyenOrderID|000000189
================================================
FILE: tasks/playwright_webarena/easy/shopping_admin/products_sales_analysis/meta.json
================================================
{
"task_id": "products_sales_analysis_easy",
"task_name": "Products Sales Analysis (Easy)",
"category_id": "shopping_admin",
"category_name": "Shopping Admin",
"description": "Make a single guided pass through Catalog, Dashboard, Customers, and Orders to collect the exact fields needed for a quick sales recap.",
"author": "Fanqing Meng",
"created_at": "2025-11-15",
"difficulty": "L1",
"tags": [
"data extraction",
"comparative analysis",
"content submission"
],
"mcp": [
"playwright"
],
"meta_data": {
"stateType": "video",
"stateContent": null,
"stateUrl": "https://storage.mcpmark.ai/tasks_state/playwright_video/magento-admin.mp4",
"stateOriginalUrl": "https://github.com/web-arena-x/webarena/tree/main/environment_docker"
}
}
================================================
FILE: tasks/playwright_webarena/easy/shopping_admin/products_sales_analysis/verify.py
================================================
import asyncio
import sys
import re
import os
import json
from pathlib import Path
def get_model_response():
"""
Get the model's response from the MCP_MESSAGES environment variable.
Returns the last assistant message text.
"""
messages_path = os.getenv("MCP_MESSAGES")
print(f"MCP_MESSAGES: {messages_path}")
if not messages_path:
print("Warning: MCP_MESSAGES environment variable not set", file=sys.stderr)
return None
try:
with open(messages_path, "r") as f:
messages = json.load(f)
# Find the last assistant message
for message in reversed(messages):
if (
message.get("role") == "assistant"
and message.get("status") == "completed"
):
content = message.get("content", [])
for item in content:
if item.get("type") == "output_text":
return item.get("text", "")
print("Warning: No assistant response found in messages", file=sys.stderr)
return None
except Exception as e:
print(f"Error reading messages file: {str(e)}", file=sys.stderr)
return None
def parse_answer_format(text):
"""
    Parse the <answer>...</answer> format from the agent's output.
Returns a dictionary with the parsed values.
"""
if not text:
print("Error: No text provided to parse", file=sys.stderr)
return None
    # Look for <answer>...</answer> pattern
    match = re.search(r"<answer>(.*?)</answer>", text, re.IGNORECASE | re.DOTALL)
if not match:
print("Error: No ... tags found in response", file=sys.stderr)
return None
answer_content = match.group(1).strip()
if not answer_content:
print("Error: Empty answer content", file=sys.stderr)
return None
# Parse each line
result = {}
lines = [line.strip() for line in answer_content.split("\n") if line.strip()]
if len(lines) != 8:
print(f"Error: Expected 8 lines in answer, got {len(lines)}", file=sys.stderr)
print(f"Lines found: {lines}", file=sys.stderr)
return None
# Expected keys for validation
expected_keys = [
"YogaProducts", "WH11Price", "ZeroQuantityProducts", "LowestProduct",
"QuestLumaflexQuantity", "DashboardRevenue", "PendingOrders",
"GraceNguyenOrderID"
]
for line in lines:
if "|" not in line:
print(f"Error: Line missing '|' separator: {line}", file=sys.stderr)
return None
parts = line.split("|", 1)
if len(parts) != 2:
print(f"Error: Invalid line format: {line}", file=sys.stderr)
return None
key, value = parts[0].strip(), parts[1].strip()
if not key or not value:
print(f"Error: Empty key or value in line: {line}", file=sys.stderr)
return None
result[key] = value
# Validate all expected keys are present
missing_keys = set(expected_keys) - set(result.keys())
if missing_keys:
print(f"Error: Missing required keys: {missing_keys}", file=sys.stderr)
return None
return result
def load_expected_answer(label_path):
"""
Load the expected answer from label.txt file.
Returns a dictionary with the expected values.
"""
try:
with open(label_path, "r") as f:
lines = f.read().strip().split("\n")
expected = {}
for line in lines:
if "|" in line:
key, value = line.split("|", 1)
expected[key.strip()] = value.strip()
return expected
except Exception as e:
print(f"Error reading label file: {str(e)}", file=sys.stderr)
return None
def compare_answers(model_answer, expected_answer):
"""
Compare the model's answer with the expected answer.
Returns True if all key information matches, False otherwise.
"""
if not model_answer or not expected_answer:
return False
# Check each expected key
mismatches = []
for key, expected_value in expected_answer.items():
model_value = model_answer.get(key, "")
# Special handling for different types of values
if key == "LowestProduct":
# Check if product name and quantity match (format: "Product Name:quantity")
if ":" in expected_value and ":" in model_value:
expected_name, expected_qty = expected_value.rsplit(":", 1)
model_name, model_qty = model_value.rsplit(":", 1)
if expected_name != model_name or expected_qty != model_qty:
mismatches.append(
f"{key}: expected '{expected_value}', got '{model_value}'"
)
else:
if expected_value != model_value:
mismatches.append(
f"{key}: expected '{expected_value}', got '{model_value}'"
)
elif key in ["WH11Price", "DashboardRevenue"]:
# For price/amount fields, normalize format
expected_clean = expected_value.replace("$", "").replace(",", "")
model_clean = model_value.replace("$", "").replace(",", "")
if expected_clean != model_clean:
mismatches.append(
f"{key}: expected '{expected_value}', got '{model_value}'"
)
elif key == "SarahMillerEmail":
# Email should match exactly
if model_value.lower() != expected_value.lower():
mismatches.append(
f"{key}: expected '{expected_value}', got '{model_value}'"
)
else:
# Exact match for other fields
if model_value != expected_value:
mismatches.append(
f"{key}: expected '{expected_value}', got '{model_value}'"
)
if mismatches:
print("\n=== Answer Comparison Mismatches ===", file=sys.stderr)
for mismatch in mismatches:
print(f"✗ {mismatch}", file=sys.stderr)
return False
print("\n=== Answer Comparison ===", file=sys.stderr)
print("✓ All key information matches the expected answer", file=sys.stderr)
return True
async def verify() -> bool:
"""
Verifies that the products and sales analysis task has been completed correctly.
First checks the model's answer against the expected label,
then optionally verifies the actual state in the Magento Admin.
"""
# Get the label file path
label_path = Path(__file__).parent / "label.txt"
# Load expected answer
expected_answer = load_expected_answer(label_path)
if not expected_answer:
print("Error: Could not load expected answer from label.txt", file=sys.stderr)
return False
# Get model's response from MCP_MESSAGES
model_response = get_model_response()
if model_response:
print("Found model response, parsing answer format...", file=sys.stderr)
model_answer = parse_answer_format(model_response)
if model_answer:
print("\n=== Model Answer Parsed ===", file=sys.stderr)
for key, value in model_answer.items():
print(f"{key}: {value}", file=sys.stderr)
# Compare answers
answer_match = compare_answers(model_answer, expected_answer)
if not answer_match:
print("\nModel answer does not match expected answer", file=sys.stderr)
return False
print("\n✓ Model answer matches expected answer", file=sys.stderr)
return True
else:
print(
"Warning: Could not parse answer format from model response",
file=sys.stderr,
)
return False
else:
print("No model response found", file=sys.stderr)
return False
def main():
"""
Executes the verification process and exits with a status code.
"""
result = asyncio.run(verify())
sys.exit(0 if result else 1)
if __name__ == "__main__":
main()
================================================
FILE: tasks/playwright_webarena/easy/shopping_admin/sales_inventory_analysis/description.md
================================================
Retain just the first three analytic arenas—products, orders, and the dashboard—so the easy task stays read-only and short.
**Task Requirements**
1. If login is required, log in with username 'admin' and password 'admin1234', then open **Catalog → Products**. Search for names containing `Sprite` to get their count, reset and set Quantity (From/To) = `100.0000` to count those rows, and finally reset to look up SKU `WS12` so you can copy its exact name and price.
2. Switch to **Sales → Orders**. Filter Status = Pending to count those orders, then search for Grace Nguyen with Status = Complete, sort Grand Total ascending, and capture the cheapest completed order ID. Clear filters, sort Grand Total descending, and record the top row’s customer and amount.
3. Finish in **Dashboard**. Sort **Bestsellers** by Quantity descending to capture the first row’s name and quantity, locate `Overnight Duffle` in that table to note its price, and check the **Top Search Terms** widget to see what position `hollister` occupies.
Answer with the reduced template:
```
SpriteProducts|count
Quantity100Products|count
WS12Info|name:price
PendingOrders|count
GraceOrderID|orderid
HighestOrderInfo|customer:amount
CheapProduct|name:quantity
OvernightDufflePrice|price
HollisterPosition|position
```
**Example Output:**
```
SpriteProducts|XX
Quantity100Products|XX
WS12Info|Product Name Here:$XX.XX
PendingOrders|X
GraceOrderID|00000XXXX
HighestOrderInfo|Customer Name:$XXX.XX
CheapProduct|Product Name:XX
OvernightDufflePrice|$XX.XX
HollisterPosition|Xth
```
================================================
FILE: tasks/playwright_webarena/easy/shopping_admin/sales_inventory_analysis/label.txt
================================================
SpriteProducts|16
Quantity100Products|1886
WS12Info|Radiant Tee:$22.00
PendingOrders|10
GraceOrderID|000000114
HighestOrderInfo|Samantha Jones:$292.40
CheapProduct|Sprite Yoga Strap 6 foot:6
OvernightDufflePrice|$45.00
HollisterPosition|1st
================================================
FILE: tasks/playwright_webarena/easy/shopping_admin/sales_inventory_analysis/meta.json
================================================
{
"task_id": "sales_inventory_analysis_easy",
"task_name": "Sales Inventory Analysis (Easy)",
"category_id": "shopping_admin",
"category_name": "Shopping Admin",
"description": "Follow one guided tour through Products, Orders, Dashboard, Customers, and Invoices to capture a compact set of sales-plus-inventory facts.",
"author": "Fanqing Meng",
"created_at": "2025-11-15",
"difficulty": "L1",
"tags": [
"data extraction",
"comparative analysis",
"inventory management"
],
"mcp": [
"playwright"
],
"meta_data": {
"stateType": "video",
"stateContent": null,
"stateUrl": "https://storage.mcpmark.ai/tasks_state/playwright_video/magento-admin.mp4",
"stateOriginalUrl": "https://github.com/web-arena-x/webarena/tree/main/environment_docker"
}
}
================================================
FILE: tasks/playwright_webarena/easy/shopping_admin/sales_inventory_analysis/verify.py
================================================
import asyncio
import sys
import re
import os
import json
from pathlib import Path
def get_model_response():
"""
Get the model's response from the MCP_MESSAGES environment variable.
Returns the last assistant message text.
"""
messages_path = os.getenv("MCP_MESSAGES")
print(f"MCP_MESSAGES: {messages_path}")
if not messages_path:
print("Warning: MCP_MESSAGES environment variable not set", file=sys.stderr)
return None
try:
with open(messages_path, "r") as f:
messages = json.load(f)
# Find the last assistant message with type='message', status='completed'
for message in reversed(messages):
if (
message.get("role") == "assistant"
and message.get("status") == "completed"
and message.get("type") == "message"
):
content = message.get("content", [])
for item in content:
# Check for both 'text' and 'output_text' types
if item.get("type") in ["text", "output_text"]:
return item.get("text", "")
print("Warning: No assistant response found in messages", file=sys.stderr)
return None
except Exception as e:
print(f"Error reading messages file: {str(e)}", file=sys.stderr)
return None
def parse_answer_format(text):
"""
    Parse the <answer>...</answer> format from the agent's output.
Returns a dictionary with the parsed values.
"""
if not text:
print("ERROR: No text provided to parse", file=sys.stderr)
return None
    # Look for <answer>...</answer> pattern
    match = re.search(r"<answer>(.*?)</answer>", text, re.IGNORECASE | re.DOTALL)
if not match:
print("ERROR: No ... tags found in the response", file=sys.stderr)
print("Response text preview (first 200 chars):", text[:200], file=sys.stderr)
return None
answer_content = match.group(1).strip()
print(f"Found answer content with {len(answer_content)} characters", file=sys.stderr)
# Parse each line
result = {}
lines = answer_content.split("\n")
# Expected keys for this task
expected_keys = [
"SpriteProducts", "Quantity100Products", "WS12Info", "PendingOrders",
"GraceOrderID", "HighestOrderInfo", "CheapProduct", "OvernightDufflePrice",
"HollisterPosition"
]
if len(lines) != 9:
print(f"ERROR: Expected 9 lines in answer, got {len(lines)}", file=sys.stderr)
print(f"Lines found: {lines}", file=sys.stderr)
return None
for i, line in enumerate(lines, 1):
if "|" not in line:
print(f"ERROR: Line {i} does not contain pipe separator '|': '{line}'", file=sys.stderr)
return None
parts = line.split("|", 1)
if len(parts) != 2:
print(f"ERROR: Line {i} could not be split into key|value: '{line}'", file=sys.stderr)
return None
key, value = parts
result[key.strip()] = value.strip()
# Check if all expected keys are present
missing_keys = set(expected_keys) - set(result.keys())
if missing_keys:
print(f"ERROR: Missing expected keys: {missing_keys}", file=sys.stderr)
print(f"Keys found: {list(result.keys())}", file=sys.stderr)
return None
# Check for unexpected keys
extra_keys = set(result.keys()) - set(expected_keys)
if extra_keys:
print(f"WARNING: Unexpected keys found: {extra_keys}", file=sys.stderr)
return result
def load_expected_answer(label_path):
"""
Load the expected answer from label.txt file.
Returns a dictionary with the expected values.
"""
try:
with open(label_path, "r") as f:
lines = f.read().strip().split("\n")
expected = {}
for line in lines:
if "|" in line:
key, value = line.split("|", 1)
expected[key.strip()] = value.strip()
return expected
except Exception as e:
print(f"Error reading label file: {str(e)}", file=sys.stderr)
return None
def compare_answers(model_answer, expected_answer):
"""
Compare the model's answer with the expected answer.
Returns True if all key information matches, False otherwise.
"""
if not model_answer or not expected_answer:
return False
# Check each expected key
mismatches = []
for key, expected_value in expected_answer.items():
model_value = model_answer.get(key, "")
# Special handling for different types of values
if key == "WS12Info":
# Check if product name and price match (format: name:price)
if ":" in expected_value and ":" in model_value:
expected_name, expected_price = expected_value.rsplit(":", 1)
model_name, model_price = model_value.rsplit(":", 1)
# Normalize price format
expected_price_clean = expected_price.replace("$", "").replace(",", "")
model_price_clean = model_price.replace("$", "").replace(",", "")
if (
expected_name != model_name
or expected_price_clean != model_price_clean
):
mismatches.append(
f"{key}: expected '{expected_value}', got '{model_value}'"
)
else:
if expected_value != model_value:
mismatches.append(
f"{key}: expected '{expected_value}', got '{model_value}'"
)
elif key == "GraceOrderID":
# Order ID should start with "000" and match exactly
if not model_value.startswith("000"):
mismatches.append(
f"{key}: expected to start with '000', got '{model_value}'"
)
elif model_value != expected_value:
mismatches.append(
f"{key}: expected '{expected_value}', got '{model_value}'"
)
elif key == "HighestOrderInfo":
# Check format customer:amount
if ":" in expected_value and ":" in model_value:
expected_customer, expected_amount = expected_value.rsplit(":", 1)
model_customer, model_amount = model_value.rsplit(":", 1)
# Normalize amount format
expected_amount_clean = expected_amount.replace("$", "").replace(
",", ""
)
model_amount_clean = model_amount.replace("$", "").replace(",", "")
if (
expected_customer != model_customer
or expected_amount_clean != model_amount_clean
):
mismatches.append(
f"{key}: expected '{expected_value}', got '{model_value}'"
)
else:
if expected_value != model_value:
mismatches.append(
f"{key}: expected '{expected_value}', got '{model_value}'"
)
elif key == "Position2Product":
# Check if product name and quantity match
if ":" in expected_value and ":" in model_value:
expected_name, expected_qty = expected_value.rsplit(":", 1)
model_name, model_qty = model_value.rsplit(":", 1)
if expected_name != model_name or expected_qty != model_qty:
mismatches.append(
f"{key}: expected '{expected_value}', got '{model_value}'"
)
else:
if expected_value != model_value:
mismatches.append(
f"{key}: expected '{expected_value}', got '{model_value}'"
)
elif key == "OvernightDufflePrice":
# Normalize price format
expected_clean = expected_value.replace("$", "").replace(",", "")
model_clean = model_value.replace("$", "").replace(",", "")
if expected_clean != model_clean:
mismatches.append(
f"{key}: expected '{expected_value}', got '{model_value}'"
)
elif key == "HollisterPosition":
# Position format (1st, 2nd, 3rd, etc.)
if model_value.lower() != expected_value.lower():
mismatches.append(
f"{key}: expected '{expected_value}', got '{model_value}'"
)
elif key == "SarahMillerInfo":
# Format: group:date
if ":" in expected_value and ":" in model_value:
expected_group, expected_date = expected_value.split(":", 1)
model_group, model_date = model_value.split(":", 1)
# Allow some flexibility in date format
if expected_group != model_group:
mismatches.append(
f"{key}: expected group '{expected_group}', got '{model_group}'"
)
# For date, check if key parts match
if not (expected_date in model_date or model_date in expected_date):
mismatches.append(
f"{key}: expected date '{expected_date}', got '{model_date}'"
)
else:
if expected_value != model_value:
mismatches.append(
f"{key}: expected '{expected_value}', got '{model_value}'"
)
elif key == "Invoice002BillTo":
# Name should match exactly
if model_value != expected_value:
mismatches.append(
f"{key}: expected '{expected_value}', got '{model_value}'"
)
else:
# Exact match for count fields and other numeric values
if model_value != expected_value:
mismatches.append(
f"{key}: expected '{expected_value}', got '{model_value}'"
)
if mismatches:
print("\n=== Answer Comparison Mismatches ===", file=sys.stderr)
for mismatch in mismatches:
print(f"✗ {mismatch}", file=sys.stderr)
return False
print("\n=== Answer Comparison ===", file=sys.stderr)
print("✓ All key information matches the expected answer", file=sys.stderr)
return True
async def verify() -> bool:
"""
Verifies that the sales and inventory analysis task has been completed correctly.
First checks the model's answer against the expected label,
then optionally verifies the actual state in the Magento Admin.
"""
print("\n" + "="*60, file=sys.stderr)
print("Starting verification of Task 5", file=sys.stderr)
print("="*60, file=sys.stderr)
# Get the label file path
label_path = Path(__file__).parent / "label.txt"
# Load expected answer
print("\n--- Loading Expected Answer ---", file=sys.stderr)
expected_answer = load_expected_answer(label_path)
if not expected_answer:
print("FATAL ERROR: Could not load expected answer from label.txt", file=sys.stderr)
return False
print(f"Successfully loaded {len(expected_answer)} expected values", file=sys.stderr)
# Get model's response from MCP_MESSAGES
print("\n--- Loading Model Response ---", file=sys.stderr)
model_response = get_model_response()
if not model_response:
print("FATAL ERROR: No model response found in MCP_MESSAGES", file=sys.stderr)
return False
print(f"Found model response ({len(model_response)} characters)", file=sys.stderr)
print("\n--- Parsing Answer Format ---", file=sys.stderr)
model_answer = parse_answer_format(model_response)
if not model_answer:
print("\nFATAL ERROR: Could not parse answer format from model response", file=sys.stderr)
print("Verification FAILED", file=sys.stderr)
return False
print("\n=== Model Answer Successfully Parsed ===", file=sys.stderr)
for key, value in model_answer.items():
print(f" {key}: {value}", file=sys.stderr)
# Compare answers
print("\n--- Comparing Answers ---", file=sys.stderr)
answer_match = compare_answers(model_answer, expected_answer)
if not answer_match:
print("\n" + "="*60, file=sys.stderr)
print("VERIFICATION FAILED: Model answer does not match expected answer", file=sys.stderr)
print("="*60, file=sys.stderr)
return False
print("\n" + "="*60, file=sys.stderr)
print("✓ VERIFICATION PASSED: Model answer matches expected answer", file=sys.stderr)
print("="*60, file=sys.stderr)
return True
def main():
"""
Executes the verification process and exits with a status code.
"""
result = asyncio.run(verify())
sys.exit(0 if result else 1)
if __name__ == "__main__":
main()
================================================
FILE: tasks/playwright_webarena/easy/shopping_admin/search_filtering_operations/description.md
================================================
Limit the search intelligence pass to the first three steps from the original task so it’s just two Search Terms views plus one dashboard glance.
**Task Requirements**
1. If login is required, log in with username 'admin' and password 'admin1234'.
2. **Marketing → SEO & Search → Search Terms**: filter for queries containing `tank` to count them, reset and filter Results = 0 to count zero-result terms, then filter Uses ≥ 11 to capture the highest-use row and list every term whose Results are between 20 and 30 (join as `term:results`, or use `None:0` if none). Remove filters when done.
3. **Reports → Search Terms**: set Hits ≥ 16 and record the filtered count, then add ID range 10–15 and capture the row with the most Results, and finally switch Store View to “Default Store View” to count those entries.
4. **Dashboard**: in **Top Search Terms** list the entries whose Results = 1 (format `term:uses` joined with `|` or `None:0`), in **Last Search Terms** pick the row with the highest combination of Results and Uses, and in **Bestsellers** copy the product + quantity shown at position #3.
Return only these data points:
```
TankSearchCount|count
ZeroResultsCount|count
HighestUseTerm|term:uses
Results20to30Term|term1:results1|term2:results2|...
Hits15PlusCount|count
ID10to15MaxResults|term:results
DefaultStoreViewCount|count
OneResultTerm|term1:uses1|term2:uses2|...
HighestResultLastSearch|term:results
Position3Bestseller|product:quantity
```
**Example Output:**
```
TankSearchCount|X
ZeroResultsCount|X
HighestUseTerm|search_term:XX
Results20to30Term|search_term1:XX1|search_term2:XX2|search_term3:XX3|...
Hits15PlusCount|X
ID10to15MaxResults|Product Name:XX
DefaultStoreViewCount|X
OneResultTerm|search_term1:XX1|search_term2:XX2|search_term3:XX3|...
HighestResultLastSearch|search_term:XX
Position3Bestseller|Product Name:X
```
**Success Criteria:**
- Successfully logged into Magento Admin
- Applied complex search filters in Search Terms section
- Used range filters for results and hits
- Sorted columns to find specific records
- Navigated between different report views
- Extracted data from filtered and sorted results
- Counted records accurately after applying filters
- Output answer in exact format with 10 data lines
- Answer wrapped in `<answer></answer>` tags
================================================
FILE: tasks/playwright_webarena/easy/shopping_admin/search_filtering_operations/label.txt
================================================
TankSearchCount|2
ZeroResultsCount|1
HighestUseTerm|hollister:19
Results20to30Term|Antonia Racer Tank:23|tanks:23
Hits15PlusCount|1
ID10to15MaxResults|Antonia Racer Tank:23
DefaultStoreViewCount|7
OneResultTerm|hollister:19|WP10:1
HighestResultLastSearch|Antonia Racer Tank:23
Position3Bestseller|Sprite Stasis Ball 65 cm:6
================================================
FILE: tasks/playwright_webarena/easy/shopping_admin/search_filtering_operations/meta.json
================================================
{
"task_id": "search_filtering_operations_easy",
"task_name": "Search Filtering Operations (Easy)",
"category_id": "shopping_admin",
"category_name": "Shopping Admin",
"description": "Follow a clearly guided path through Search Terms, the Search Terms report, and the dashboard widgets to capture the metrics needed for a focused search-behavior brief.",
"author": "Fanqing Meng",
"created_at": "2025-11-15",
"difficulty": "L1",
"tags": [
"content submission"
],
"mcp": [
"playwright"
],
"meta_data": {
"stateType": "video",
"stateContent": null,
"stateUrl": "https://storage.mcpmark.ai/tasks_state/playwright_video/magento-admin.mp4",
"stateOriginalUrl": "https://github.com/web-arena-x/webarena/tree/main/environment_docker"
}
}
================================================
FILE: tasks/playwright_webarena/easy/shopping_admin/search_filtering_operations/verify.py
================================================
import re
import json
import os
import sys
def verify(messages):
"""
Verify that the agent has successfully performed complex search and filtering operations
in the Magento Admin panel and extracted all required information correctly.
Args:
messages: List of message dictionaries containing the conversation
Returns:
Dictionary with 'valid' boolean and 'reason' string
"""
# Find the last assistant message with status "completed" and type "message"
answer_content = None
for message in reversed(messages):
if (
message.get("role") == "assistant"
and message.get("status") == "completed"
and message.get("type") == "message"
and message.get("content")
):
# Extract text from content structure
content = message["content"]
if isinstance(content, list):
for item in content:
if isinstance(item, dict) and item.get("type") == "output_text":
text = item.get("text", "")
# Look for answer tags with case-insensitive search
answer_match = re.search(
r"(.*?)", text, re.DOTALL | re.IGNORECASE
)
if answer_match:
answer_content = answer_match.group(1).strip()
break
elif isinstance(content, str):
# Look for answer tags in string content
answer_match = re.search(r"(.*?)", content, re.DOTALL | re.IGNORECASE)
if answer_match:
answer_content = answer_match.group(1).strip()
break
if answer_content:
break
if not answer_content:
return {"valid": False, "reason": "No answer found in tags"}
# Expected format - each line should have a key|value pair
expected_keys = [
"TankSearchCount",
"ZeroResultsCount",
"HighestUseTerm",
"Results20to30Term",
"Hits15PlusCount",
"ID10to15MaxResults",
"DefaultStoreViewCount",
"OneResultTerm",
"HighestResultLastSearch",
"Position3Bestseller",
]
# Parse the answer
lines = answer_content.strip().split("\n")
# Check if we have exactly 10 lines
if len(lines) != 10:
return {"valid": False, "reason": f"Expected 10 data lines, found {len(lines)}"}
# Parse each line and validate format
extracted_data = {}
for line in lines:
if "|" not in line:
return {
"valid": False,
"reason": f"Invalid format in line: {line}. Expected 'key|value' format",
}
parts = line.split("|", 1)
if len(parts) != 2:
return {"valid": False, "reason": f"Invalid format in line: {line}"}
key, value = parts
extracted_data[key] = value
# Check all required keys are present
missing_keys = set(expected_keys) - set(extracted_data.keys())
if missing_keys:
return {
"valid": False,
"reason": f"Missing required keys: {', '.join(missing_keys)}",
}
# Validate specific data formats and expected values based on the current data
# 1. TankSearchCount should be a number (2 terms containing 'tank')
if not extracted_data["TankSearchCount"].isdigit():
return {
"valid": False,
"reason": f"TankSearchCount should be a number, got: {extracted_data['TankSearchCount']}",
}
# Expected: "Antonia Racer Tank" and "tanks" contain 'tank'
if extracted_data["TankSearchCount"] != "2":
return {
"valid": False,
"reason": f"TankSearchCount should be '2', got: {extracted_data['TankSearchCount']}",
}
# 2. ZeroResultsCount should be a number (nike has 0 results)
if not extracted_data["ZeroResultsCount"].isdigit():
return {
"valid": False,
"reason": f"ZeroResultsCount should be a number, got: {extracted_data['ZeroResultsCount']}",
}
if extracted_data["ZeroResultsCount"] != "1":
return {
"valid": False,
"reason": f"ZeroResultsCount should be '1', got: {extracted_data['ZeroResultsCount']}",
}
# 3. HighestUseTerm should be in format "term:uses"
if ":" not in extracted_data["HighestUseTerm"]:
return {
"valid": False,
"reason": f"HighestUseTerm should be in format 'term:uses', got: {extracted_data['HighestUseTerm']}",
}
# hollister has 19 uses (highest among terms with > 10 uses)
if extracted_data["HighestUseTerm"] != "hollister:19":
return {
"valid": False,
"reason": f"HighestUseTerm should be 'hollister:19', got: {extracted_data['HighestUseTerm']}",
}
# 4. Results20to30Term should be in format "term:results"
if ":" not in extracted_data["Results20to30Term"]:
return {
"valid": False,
"reason": f"Results20to30Term should be in format 'term:results', got: {extracted_data['Results20to30Term']}",
}
# Both "tanks" and "Antonia Racer Tank" have 23 results (between 20-30)
valid_results20to30 = ["tanks:23", "Antonia Racer Tank:23"]
# Check if answer contains one of the valid values or both separated by |
if not any(
val in extracted_data["Results20to30Term"] for val in valid_results20to30
):
return {
"valid": False,
"reason": f"Results20to30Term should contain 'tanks:23' or 'Antonia Racer Tank:23', got: {extracted_data['Results20to30Term']}",
}
# 5. Hits15PlusCount should be a number (only hollister has 19 hits > 15)
if not extracted_data["Hits15PlusCount"].isdigit():
return {
"valid": False,
"reason": f"Hits15PlusCount should be a number, got: {extracted_data['Hits15PlusCount']}",
}
if extracted_data["Hits15PlusCount"] != "1":
return {
"valid": False,
"reason": f"Hits15PlusCount should be '1', got: {extracted_data['Hits15PlusCount']}",
}
# 6. ID10to15MaxResults should be in format "term:results"
if ":" not in extracted_data["ID10to15MaxResults"]:
return {
"valid": False,
"reason": f"ID10to15MaxResults should be in format 'term:results', got: {extracted_data['ID10to15MaxResults']}",
}
# ID 11 is hollister (1 result), ID 13 is Antonia Racer Tank (23 results)
if extracted_data["ID10to15MaxResults"] != "Antonia Racer Tank:23":
return {
"valid": False,
"reason": f"ID10to15MaxResults should be 'Antonia Racer Tank:23', got: {extracted_data['ID10to15MaxResults']}",
}
# 7. DefaultStoreViewCount should be a number (all 7 terms are from Default Store View)
if not extracted_data["DefaultStoreViewCount"].isdigit():
return {
"valid": False,
"reason": f"DefaultStoreViewCount should be a number, got: {extracted_data['DefaultStoreViewCount']}",
}
if extracted_data["DefaultStoreViewCount"] != "7":
return {
"valid": False,
"reason": f"DefaultStoreViewCount should be '7', got: {extracted_data['DefaultStoreViewCount']}",
}
# 8. OneResultTerm should be in format "term:uses"
if ":" not in extracted_data["OneResultTerm"]:
return {
"valid": False,
"reason": f"OneResultTerm should be in format 'term:uses', got: {extracted_data['OneResultTerm']}",
}
# Both hollister and WP10 have exactly 1 result
valid_one_result = ["hollister:19", "WP10:1"]
if not any(val in extracted_data["OneResultTerm"] for val in valid_one_result):
return {
"valid": False,
"reason": f"OneResultTerm should contain 'hollister:19' or 'WP10:1', got: {extracted_data['OneResultTerm']}",
}
# 9. HighestResultLastSearch should be in format "term:results"
if ":" not in extracted_data["HighestResultLastSearch"]:
return {
"valid": False,
"reason": f"HighestResultLastSearch should be in format 'term:results', got: {extracted_data['HighestResultLastSearch']}",
}
# In Last Search Terms: tanks and Antonia Racer Tank both have 23 results (highest)
valid_highest_last = ["tanks:23", "Antonia Racer Tank:23"]
if not any(
val in extracted_data["HighestResultLastSearch"] for val in valid_highest_last
):
return {
"valid": False,
"reason": f"HighestResultLastSearch should contain 'tanks:23' or 'Antonia Racer Tank:23', got: {extracted_data['HighestResultLastSearch']}",
}
# 10. Position3Bestseller should be in format "product:quantity"
if ":" not in extracted_data["Position3Bestseller"]:
return {
"valid": False,
"reason": f"Position3Bestseller should be in format 'product:quantity', got: {extracted_data['Position3Bestseller']}",
}
# Position 3 in Bestsellers is "Sprite Stasis Ball 65 cm" with quantity 6
if extracted_data["Position3Bestseller"] != "Sprite Stasis Ball 65 cm:6":
return {
"valid": False,
"reason": f"Position3Bestseller should be 'Sprite Stasis Ball 65 cm:6', got: {extracted_data['Position3Bestseller']}",
}
# All validations passed
return {
"valid": True,
"reason": "All complex search and filtering operations completed successfully",
}
if __name__ == "__main__":
# Load messages from environment variable
messages_path = os.getenv("MCP_MESSAGES")
if not messages_path:
print(
json.dumps(
{"valid": False, "reason": "MCP_MESSAGES environment variable not set"}
)
)
exit(1)
try:
with open(messages_path, "r") as f:
messages = json.load(f)
except Exception as e:
print(
json.dumps({"valid": False, "reason": f"Failed to load messages: {str(e)}"})
)
exit(1)
# Run verification
result = verify(messages)
print(json.dumps(result))
# Exit with appropriate code based on verification result
sys.exit(0 if result["valid"] else 1)
================================================
FILE: tasks/playwright_webarena/standard/reddit/ai_data_analyst/description.md
================================================
I'm conducting an AI research analysis and need specific data from this forum platform. Please help me gather information about AI discussions and community engagement.
**Task Requirements:**
1. Create a new account with the exact username 'AIDataAnalyst2025' and password 'SecurePass123!'
2. After logging in, navigate to the deeplearning forum and count the exact number of posts visible on the first page
3. Go to the MachineLearning forum and find the post titled '[P] I made a command-line tool that explains your errors using ChatGPT (link in comments)' - record its exact vote count
4. Use the search bar to search for 'transformer' and record the exact title of the third search result
5. In the MachineLearning forum, click on the 'Sort by: Hot' button and change it to 'New'. Navigate to page 2 of the results and find the post with the highest upvote count on that page:
- Record the exact post title
- Record the exact vote count
- Click through to the post and find the last comment
- Record the username of the last commenter
- Record the exact text of the last comment
6. After collecting all the data, go to the MachineLearning forum and submit a new post with:
- Title: "MachineLearning_Extraction"
- Body text must be EXACTLY these lines and nothing else (keep the keys as-is, only replace the values after the pipe, and follow the markdown format):
```
- Deeplearning_Post_Count|FILL_VALUE
- ChatGPT_Tool_Vote_Count|FILL_VALUE
- Transformer_Third_Result|FILL_VALUE
- Page2_Top_Post_Title|FILL_VALUE
- Page2_Top_Post_Votes|FILL_VALUE
- Page2_Last_Comment_Username|FILL_VALUE
- Page2_Last_Comment_Text|FILL_VALUE
```
================================================
FILE: tasks/playwright_webarena/standard/reddit/ai_data_analyst/label.txt
================================================
- Deeplearning_Post_Count|25
- ChatGPT_Tool_Vote_Count|2655
- Transformer_Third_Result|[R] The Table Feature Transformation Library Release
- Page2_Top_Post_Title|[D]GPT-4 might be able to tell you if it hallucinated
- Page2_Top_Post_Votes|634
- Page2_Last_Comment_Username|Nous_AI
- Page2_Last_Comment_Text|Fascinating.
================================================
FILE: tasks/playwright_webarena/standard/reddit/ai_data_analyst/meta.json
================================================
{
"task_id": "ai_data_analyst",
"task_name": "AI Data Analyst",
"category_id": "reddit",
"category_name": "Reddit",
"description": "Create account on forum platform, collect AI/ML discussion metrics including post counts, vote data, and analyze community engagement patterns through systematic data extraction.",
"author": "Fanqing Meng",
"created_at": "2025-08-12",
"difficulty": "L3",
"tags": [
"user interaction",
"data extraction",
"comparative analysis"
],
"mcp": [
"playwright"
],
"meta_data": {
"stateType": "video",
"stateContent": null,
"stateUrl": "https://storage.mcpmark.ai/tasks_state/playwright_video/postmill.mp4",
"stateOriginalUrl": "https://github.com/web-arena-x/webarena/tree/main/environment_docker"
}
}
================================================
FILE: tasks/playwright_webarena/standard/reddit/ai_data_analyst/verify.py
================================================
import asyncio
import sys
import re
import os
from pathlib import Path
from playwright.async_api import (
async_playwright,
TimeoutError as PlaywrightTimeoutError,
)
# Read base_url from the environment variable, falling back to the local default
BASE_URL = os.getenv("WEBARENA_BASE_URL", "http://localhost:9999").rstrip("/")
def parse_key_value_format(text):
"""
Parse the Key|Value format from the submission body using regex.
Works with markdown format using pipe separators, with or without list markers.
"""
data = {}
# Define patterns for each field using pipe separator
# Optional list markers (-, •, *) at the beginning
patterns = {
"Deeplearning_Post_Count": r"(?:[-•*]\s*)?Deeplearning_Post_Count\s*\|\s*(\d+)",
"ChatGPT_Tool_Vote_Count": r"(?:[-•*]\s*)?ChatGPT_Tool_Vote_Count\s*\|\s*(\d+)",
"Transformer_Third_Result": r"(?:[-•*]\s*)?Transformer_Third_Result\s*\|\s*(.+?)(?=\n|$)",
"Page2_Top_Post_Title": r"(?:[-•*]\s*)?Page2_Top_Post_Title\s*\|\s*(.+?)(?=\n|$)",
"Page2_Top_Post_Votes": r"(?:[-•*]\s*)?Page2_Top_Post_Votes\s*\|\s*(\d+)",
"Page2_Last_Comment_Username": r"(?:[-•*]\s*)?Page2_Last_Comment_Username\s*\|\s*(.+?)(?=\n|$)",
"Page2_Last_Comment_Text": r"(?:[-•*]\s*)?Page2_Last_Comment_Text\s*\|\s*(.+?)(?=\n|$)",
}
# Extract each field using regex
for key, pattern in patterns.items():
match = re.search(pattern, text, re.MULTILINE)
if match:
# For text fields, clean up newlines and extra spaces
value = match.group(1).strip()
if key not in [
"Deeplearning_Post_Count",
"ChatGPT_Tool_Vote_Count",
"Page2_Top_Post_Votes",
]:
# Replace newlines with spaces and normalize whitespace
value = " ".join(value.split())
data[key] = value
return data
def normalize_text(text):
"""
Normalize text for comparison by handling different quote styles and whitespace.
"""
if not isinstance(text, str):
return str(text)
# Replace various quote styles with standard quotes
    text = text.replace("‘", "'").replace("’", "'")
    text = text.replace("“", '"').replace("”", '"')
# Normalize whitespace
text = " ".join(text.split())
return text.strip()
async def verify() -> bool:
"""
Verifies that the ML extraction task has been completed correctly by checking the forum post.
"""
async with async_playwright() as p:
browser = await p.chromium.launch(headless=True)
context = await browser.new_context()
page = await context.new_page()
try:
# Navigate to the main page
print("Navigating to forum...", file=sys.stderr)
await page.goto(f"{BASE_URL}/", wait_until="networkidle")
# Step 1: Check if account was created correctly by trying to login
print("Step 1: Verifying account creation...", file=sys.stderr)
user_button = page.locator('button:has-text("AIDataAnalyst2025")')
if not await user_button.count():
# Try to login
print("Attempting to login with AIDataAnalyst2025...", file=sys.stderr)
# Click login link
await page.click('a:has-text("Log in")')
await page.wait_for_load_state("networkidle")
# Fill login form
await page.fill('input[name="_username"]', "AIDataAnalyst2025")
await page.fill('input[name="_password"]', "SecurePass123!")
# Submit login form
await page.click('button:has-text("Log in")')
await page.wait_for_load_state("networkidle")
# Check if login successful
user_button = page.locator('button:has-text("AIDataAnalyst2025")')
if not await user_button.count():
print("FAILED: Account AIDataAnalyst2025 with password SecurePass123! cannot be logged in", file=sys.stderr)
print("This means the account was not created correctly", file=sys.stderr)
return False
print("PASSED: Successfully logged in as AIDataAnalyst2025", file=sys.stderr)
else:
print("PASSED: Already logged in as AIDataAnalyst2025", file=sys.stderr)
# Step 2: Check if submission was created correctly in MachineLearning forum
print("\nStep 2: Verifying submission creation...", file=sys.stderr)
print("Navigating to MachineLearning forum...", file=sys.stderr)
await page.goto(
f"{BASE_URL}/f/MachineLearning", wait_until="networkidle"
)
# Look for the post with title "MachineLearning_Extraction"
print(
"Looking for submission with title 'MachineLearning_Extraction'...",
file=sys.stderr,
)
post_link = page.locator('a:has-text("MachineLearning_Extraction")')
if not await post_link.count():
print(
"FAILED: Could not find submission with title 'MachineLearning_Extraction' in MachineLearning forum",
file=sys.stderr,
)
return False
print("PASSED: Found submission 'MachineLearning_Extraction' in MachineLearning forum", file=sys.stderr)
# Step 3: Check submission content matches expected values
print("\nStep 3: Verifying submission content...", file=sys.stderr)
# Click on the submission to view its content
await post_link.first.click()
await page.wait_for_load_state("networkidle")
# Extract the submission body content
# Try multiple possible selectors for the post body
post_content = None
selectors = [
".submission__body",
".post-body",
".RichText",
'[class*="RichText"]',
'div:has(> p:has-text("Deeplearning_Post_Count"))',
'div:has-text("Deeplearning_Post_Count"):has-text("Page2_Last_Comment_Text")',
]
for selector in selectors:
content_element = page.locator(selector)
if await content_element.count():
post_content = await content_element.first.inner_text()
if "Deeplearning_Post_Count" in post_content:
print(
f"Found submission content using selector: {selector}",
file=sys.stderr,
)
break
if not post_content or "Deeplearning_Post_Count" not in post_content:
print(
"FAILED: Could not find submission body with required format",
file=sys.stderr,
)
print(
"Expected body to contain 'Deeplearning_Post_Count' in pipe-separated format",
file=sys.stderr,
)
return False
print("Found submission body content", file=sys.stderr)
print(f"Raw content preview: {post_content[:200]}...", file=sys.stderr)
            # Parse the Key|Value format
extracted_data = parse_key_value_format(post_content)
print(f"Extracted data: {extracted_data}", file=sys.stderr)
# Load expected values from label.txt
label_path = Path(__file__).parent / "label.txt"
if label_path.exists():
with open(label_path, "r") as f:
expected_text = f.read().strip()
expected_data = parse_key_value_format(expected_text)
print("Loaded expected values from label.txt", file=sys.stderr)
# Verify all required keys are present
required_keys = [
"Deeplearning_Post_Count",
"ChatGPT_Tool_Vote_Count",
"Transformer_Third_Result",
"Page2_Top_Post_Title",
"Page2_Top_Post_Votes",
"Page2_Last_Comment_Username",
"Page2_Last_Comment_Text",
]
missing_keys = []
for key in required_keys:
if key not in extracted_data:
missing_keys.append(key)
if missing_keys:
print(
"FAILED: Missing required keys in submission: {', '.join(missing_keys)}",
file=sys.stderr,
)
print(
"Expected all 7 fields to be present in pipe-separated format",
file=sys.stderr,
)
return False
# Validate data format and content
errors = []
# Check numeric fields
try:
post_count = int(extracted_data["Deeplearning_Post_Count"])
if (
"expected_data" in locals()
and "Deeplearning_Post_Count" in expected_data
):
expected_count = int(expected_data["Deeplearning_Post_Count"])
if post_count != expected_count:
errors.append(
f"Deeplearning_Post_Count mismatch: got {post_count}, expected {expected_count}"
)
except ValueError:
errors.append(
f"Deeplearning_Post_Count must be a number, got: {extracted_data['Deeplearning_Post_Count']}"
)
# If we have expected data, compare against it
if "expected_data" in locals():
# Compare each field
for key in required_keys:
if key in expected_data and key in extracted_data:
expected_val = normalize_text(expected_data[key])
actual_val = normalize_text(extracted_data[key])
# For numeric fields, compare as integers
if key in [
"Deeplearning_Post_Count",
"ChatGPT_Tool_Vote_Count",
"Page2_Top_Post_Votes",
]:
try:
expected_int = int(expected_val)
actual_int = int(actual_val)
if expected_int != actual_int:
errors.append(
f"{key} mismatch: got {actual_int}, expected {expected_int}"
)
except ValueError:
errors.append(
f"{key} should be numeric: got '{actual_val}'"
)
else:
# For text fields, compare normalized text
if expected_val != actual_val:
errors.append(
f"{key} mismatch: got '{actual_val}', expected '{expected_val}'"
)
else:
# If no expected data, just do basic validation
for key in required_keys:
if key not in extracted_data:
errors.append(f"Missing required key: {key}")
elif (
not extracted_data[key] or extracted_data[key] == "[FILL_VALUE]"
):
errors.append(f"{key} was not filled in")
if errors:
print(
"FAILED: Content validation failed with the following issues:",
file=sys.stderr,
)
for error in errors:
print(f" - {error}", file=sys.stderr)
print("\nExpected values from label.txt:", file=sys.stderr)
if "expected_data" in locals():
for key in required_keys:
if key in expected_data:
print(f" {key}: {expected_data[key]}", file=sys.stderr)
return False
# All checks passed
print("\n=== VERIFICATION SUCCESSFUL ===")
print("✓ Step 1: Account AIDataAnalyst2025 can login with password SecurePass123!")
print("✓ Step 2: Submission 'MachineLearning_Extraction' found in MachineLearning forum")
print("✓ Step 3: All submission content matches expected values:")
print(f" - Deeplearning_Post_Count: {extracted_data['Deeplearning_Post_Count']}")
print(f" - ChatGPT_Tool_Vote_Count: {extracted_data['ChatGPT_Tool_Vote_Count']}")
print(f" - Transformer_Third_Result: {extracted_data['Transformer_Third_Result']}")
print(f" - Page2_Top_Post_Title: {extracted_data['Page2_Top_Post_Title']}")
print(f" - Page2_Top_Post_Votes: {extracted_data['Page2_Top_Post_Votes']}")
print(f" - Page2_Last_Comment_Username: {extracted_data['Page2_Last_Comment_Username']}")
print(f" - Page2_Last_Comment_Text: {extracted_data['Page2_Last_Comment_Text']}")
print("✓ All data in correct pipe-separated markdown format")
return True
except PlaywrightTimeoutError as e:
print(f"Error: Timeout occurred - {str(e)}", file=sys.stderr)
return False
except Exception as e:
print(f"Error: Unexpected error - {str(e)}", file=sys.stderr)
return False
finally:
await browser.close()
def main():
"""
Executes the verification process and exits with a status code.
"""
result = asyncio.run(verify())
sys.exit(0 if result else 1)
if __name__ == "__main__":
main()
================================================
FILE: tasks/playwright_webarena/standard/reddit/budget_europe_travel/description.md
================================================
Create a comprehensive budget travel resource for European travel on the Postmill forum platform by establishing a dedicated travel community with integrated wiki guides and engaging content.
**Task Requirements:**
1. Register a new account with username "EuroTravelPlanner" and password "SecureTravel2024!"
2. After successful registration and login, search for "Europe travel budget" using the search bar to check for existing content
3. Create a new forum dedicated to budget European travel:
- Forum name: "BudgetEuropeTravel"
- Title: "Budget Travel Europe"
- Description: "Community for sharing money-saving tips for European travel"
- Sidebar content: "Share your best European travel deals and budget tips here!"
4. Create a comprehensive wiki page for your forum:
- Path: "europe-travel-budget-guide"
- Title: "Complete Budget Travel Guide for Europe 2024"
- Content must contain: "Eurail passes and budget airlines"
5. Navigate to your newly created forum and create your first post:
- Title: "My 14-day Europe trip for under 1000 - Complete itinerary"
- Body must contain: "budget guide wiki"
6. Search for "travel insurance Europe" using the search function:
- upvote the first post
7. Navigate to user settings and configure:
- Set timezone to "Europe/Amsterdam" (to match your travel planning focus)
- Ensure "Notify on reply" is checked (enabled) for community engagement
================================================
FILE: tasks/playwright_webarena/standard/reddit/budget_europe_travel/meta.json
================================================
{
"task_id": "budget_europe_travel",
"task_name": "Budget Europe Travel",
"category_id": "reddit",
"category_name": "Reddit",
"description": "Create comprehensive budget travel resource for European destinations by establishing dedicated travel community with integrated wiki guides and engaging travel content.",
"author": "Fanqing Meng",
"created_at": "2025-08-12",
"difficulty": "L3",
"tags": [
"content submission",
"user interaction"
],
"mcp": [
"playwright"
],
"meta_data": {
"stateType": "video",
"stateContent": null,
"stateUrl": "https://storage.mcpmark.ai/tasks_state/playwright_video/postmill.mp4",
"stateOriginalUrl": "https://github.com/web-arena-x/webarena/tree/main/environment_docker"
}
}
================================================
FILE: tasks/playwright_webarena/standard/reddit/budget_europe_travel/verify.py
================================================
import asyncio
import sys
import re
import os
from pathlib import Path
from playwright.async_api import async_playwright, TimeoutError as PlaywrightTimeoutError
BASE_URL = os.getenv("WEBARENA_BASE_URL", "http://localhost:9999").rstrip("/")
def normalize_text(text):
"""
Normalize text for comparison by handling different quote styles and whitespace.
"""
if not isinstance(text, str):
return str(text)
# Replace various quote styles with standard quotes
    text = text.replace("\u2018", "'").replace("\u2019", "'")
    text = text.replace("\u201c", '"').replace("\u201d", '"')
    text = text.replace("&amp;", "&")
# Normalize whitespace
text = ' '.join(text.split())
return text.strip()
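# Illustrative example (hypothetical input): smart quotes are normalized and whitespace collapsed, e.g.
#   normalize_text("Europe\u2019s   best deals") -> "Europe's best deals"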
async def verify() -> bool:
"""
Verifies that the budget Europe travel resource task has been completed correctly.
"""
async with async_playwright() as p:
browser = await p.chromium.launch(headless=True)
context = await browser.new_context()
page = await context.new_page()
verification_passed = True
try:
# 1. Check if account can login with correct credentials
print("="*60)
print("Step 1: Verifying account login with credentials...", file=sys.stderr)
print("="*60)
await page.goto(f"{BASE_URL}/", wait_until='networkidle')
# First logout if already logged in
user_button = page.locator('button:has-text("EuroTravelPlanner")')
if await user_button.count():
print("Already logged in, logging out first...", file=sys.stderr)
await user_button.click()
logout_link = page.locator('a:has-text("Log out")')
if await logout_link.count():
await logout_link.click()
await page.wait_for_load_state('networkidle')
# Now try to login with the specified credentials
print("Attempting to login with username 'EuroTravelPlanner' and password 'SecureTravel2024!'...", file=sys.stderr)
# Navigate to login page
login_link = page.locator('a:has-text("Log in")')
if await login_link.count():
await login_link.click()
await page.wait_for_load_state('networkidle')
else:
print("❌ ERROR: Cannot find login link", file=sys.stderr)
verification_passed = False
if verification_passed:
# Fill login form with exact credentials
await page.fill('input[name="_username"]', 'EuroTravelPlanner')
await page.fill('input[name="_password"]', 'SecureTravel2024!')
# Submit login
login_button = page.locator('button[type="submit"]:has-text("Log in")')
if not await login_button.count():
login_button = page.locator('button:has-text("Log in")')
await login_button.click()
await page.wait_for_load_state('networkidle')
# Verify login success
user_button = page.locator('button:has-text("EuroTravelPlanner")')
if not await user_button.count():
print("❌ ERROR: Login failed with username 'EuroTravelPlanner' and password 'SecureTravel2024!'", file=sys.stderr)
verification_passed = False
else:
print("✓ Account login successful with correct credentials", file=sys.stderr)
# 2. Check if forum exists and has correct properties
print("\n" + "="*60)
print("Step 2: Checking forum existence and properties...", file=sys.stderr)
print("="*60)
# Check if forum exists at /f/BudgetEuropeTravel
await page.goto(f"{BASE_URL}/f/BudgetEuropeTravel", wait_until='networkidle')
# Check if we get 404 or the forum exists
page_content = await page.content()
page_title = await page.title()
if "404" in page_title or "not found" in page_title.lower() or "Page not found" in page_content:
print("❌ ERROR: Forum /f/BudgetEuropeTravel does not exist (404)", file=sys.stderr)
verification_passed = False
else:
print("✓ Forum /f/BudgetEuropeTravel exists", file=sys.stderr)
# Navigate to edit page to check properties
await page.goto(f"{BASE_URL}/f/BudgetEuropeTravel/edit", wait_until='networkidle')
# Check if we can access edit page
edit_page_content = await page.content()
edit_page_title = await page.title()
if "404" in edit_page_title or "not found" in edit_page_title.lower() or "Page not found" in edit_page_content:
print("❌ ERROR: Cannot access forum edit page at /f/BudgetEuropeTravel/edit", file=sys.stderr)
verification_passed = False
else:
print("✓ Forum edit page accessible", file=sys.stderr)
# Check forum title
title_input = page.locator('input[name*="title"], input#forum_title')
if await title_input.count():
title_value = await title_input.input_value()
if title_value != "Budget Travel Europe":
print(f"❌ ERROR: Forum title is '{title_value}', expected 'Budget Travel Europe'", file=sys.stderr)
verification_passed = False
else:
print("✓ Forum title correct: 'Budget Travel Europe'", file=sys.stderr)
else:
print("❌ ERROR: Cannot find forum title field", file=sys.stderr)
verification_passed = False
# Check forum description
desc_input = page.locator('textarea[name*="description"], input[name*="description"]')
if await desc_input.count():
desc_value = await desc_input.input_value()
expected_desc = "Community for sharing money-saving tips for European travel"
if desc_value != expected_desc:
print(f"❌ ERROR: Forum description is '{desc_value}', expected '{expected_desc}'", file=sys.stderr)
verification_passed = False
else:
print("✓ Forum description correct", file=sys.stderr)
else:
print("❌ ERROR: Cannot find forum description field", file=sys.stderr)
verification_passed = False
# Check sidebar content
sidebar_input = page.locator('textarea[name*="sidebar"]')
if await sidebar_input.count():
sidebar_value = await sidebar_input.input_value()
expected_sidebar = "Share your best European travel deals and budget tips here!"
if sidebar_value != expected_sidebar:
print(f"❌ ERROR: Forum sidebar is '{sidebar_value}', expected '{expected_sidebar}'", file=sys.stderr)
verification_passed = False
else:
print("✓ Forum sidebar correct", file=sys.stderr)
else:
print("❌ ERROR: Cannot find forum sidebar field", file=sys.stderr)
verification_passed = False
# 3. Check wiki page existence and content
print("\n" + "="*60)
print("Step 3: Checking wiki page existence and content...", file=sys.stderr)
print("="*60)
# Try the wiki URL with /wiki/ path
await page.goto(f"{BASE_URL}/wiki/europe-travel-budget-guide", wait_until='networkidle')
wiki_page_content = await page.content()
wiki_page_title = await page.title()
if "404" in wiki_page_title or "not found" in wiki_page_title.lower() or "Page not found" in wiki_page_content:
print("❌ ERROR: Wiki page does not exist at /wiki/europe-travel-budget-guide", file=sys.stderr)
verification_passed = False
else:
print("✓ Wiki page exists at /wiki/europe-travel-budget-guide", file=sys.stderr)
# Check wiki title
wiki_title_found = False
expected_wiki_title = "Complete Budget Travel Guide for Europe 2024"
# Try multiple selectors for wiki title
wiki_title_selectors = [
f'h1:has-text("{expected_wiki_title}")',
f'h1:text-is("{expected_wiki_title}")',
'h1'
]
for selector in wiki_title_selectors:
wiki_title_elem = page.locator(selector)
if await wiki_title_elem.count():
title_text = await wiki_title_elem.first.text_content()
if expected_wiki_title in title_text:
wiki_title_found = True
break
if not wiki_title_found:
print(f"❌ ERROR: Wiki title '{expected_wiki_title}' not found", file=sys.stderr)
verification_passed = False
else:
print(f"✓ Wiki title correct: '{expected_wiki_title}'", file=sys.stderr)
# Check for required content in wiki
required_wiki_content = "Eurail passes and budget airlines"
if required_wiki_content not in wiki_page_content:
print(f"❌ ERROR: Wiki content must contain '{required_wiki_content}'", file=sys.stderr)
verification_passed = False
else:
print(f"✓ Wiki content contains required text: '{required_wiki_content}'", file=sys.stderr)
# 4. Check for post in the forum
print("\n" + "="*60)
print("Step 4: Checking for post in forum...", file=sys.stderr)
print("="*60)
await page.goto(f"{BASE_URL}/f/BudgetEuropeTravel", wait_until='networkidle')
expected_post_title = "My 14-day Europe trip for under 1000 - Complete itinerary"
post_link = page.locator(f'a:has-text("{expected_post_title}")')
if not await post_link.count():
print(f"❌ ERROR: Post with title '{expected_post_title}' not found in forum", file=sys.stderr)
verification_passed = False
else:
print(f"✓ Post found with title: '{expected_post_title}'", file=sys.stderr)
# Click on the post to check its content
await post_link.first.click()
await page.wait_for_load_state('networkidle')
# Check if post contains required text
post_page_content = await page.content()
required_post_content = "budget guide wiki"
if required_post_content not in post_page_content:
print(f"❌ ERROR: Post body must contain '{required_post_content}'", file=sys.stderr)
verification_passed = False
else:
print(f"✓ Post content contains required text: '{required_post_content}'", file=sys.stderr)
# 5. Check upvote on search result
print("\n" + "="*60)
print("Step 5: Checking upvote on search result...", file=sys.stderr)
print("="*60)
# Navigate to search results for "travel insurance Europe"
await page.goto(f"{BASE_URL}/search?q=travel+insurance+Europe", wait_until='networkidle')
# Check if we're on search results page
if "/search" not in page.url:
print("❌ ERROR: Not on search results page", file=sys.stderr)
verification_passed = False
else:
print("✓ On search results page for 'travel insurance Europe'", file=sys.stderr)
# Check for upvoted posts
upvote_found = False
# Method 1: Check for "Retract upvote" button (indicates user has upvoted)
retract_buttons = page.locator('button:has-text("Retract upvote")')
if await retract_buttons.count() > 0:
print("✓ Found upvoted post (Retract upvote button present)", file=sys.stderr)
upvote_found = True
# Method 2: Check for posts with upvote count >= 1
if not upvote_found:
# Look for vote counts
vote_elements = page.locator('div.vote, span.vote-count, [class*="vote"]')
for i in range(await vote_elements.count()):
vote_elem = vote_elements.nth(i)
vote_text = await vote_elem.text_content()
try:
# Extract number from vote text
import re
numbers = re.findall(r'\d+', vote_text)
if numbers:
vote_count = int(numbers[0])
if vote_count >= 1:
print(f"✓ Found post with {vote_count} upvote(s)", file=sys.stderr)
upvote_found = True
break
except:
continue
if not upvote_found:
print("❌ ERROR: No upvoted posts found in search results", file=sys.stderr)
verification_passed = False
# 6. Check user settings
print("\n" + "="*60)
print("Step 6: Checking user settings...", file=sys.stderr)
print("="*60)
await page.goto(f"{BASE_URL}/user/EuroTravelPlanner/preferences", wait_until='networkidle')
# Check timezone setting
timezone_correct = False
timezone_select = page.locator('select[name*="timezone"], select#timezone')
if await timezone_select.count():
selected_value = await timezone_select.input_value()
if selected_value == "Europe/Amsterdam":
print("✓ Timezone correctly set to 'Europe/Amsterdam'", file=sys.stderr)
timezone_correct = True
else:
# Check selected option text
selected_option = timezone_select.locator('option[selected]')
if await selected_option.count():
option_text = await selected_option.text_content()
if "Amsterdam" in option_text:
print("✓ Timezone correctly set to Europe/Amsterdam", file=sys.stderr)
timezone_correct = True
else:
print(f"❌ ERROR: Timezone is set to '{option_text}', expected 'Europe/Amsterdam'", file=sys.stderr)
verification_passed = False
else:
print(f"❌ ERROR: Timezone is '{selected_value}', expected 'Europe/Amsterdam'", file=sys.stderr)
verification_passed = False
else:
print("❌ ERROR: Cannot find timezone selector", file=sys.stderr)
verification_passed = False
# Check "Notify on reply" setting
notify_correct = False
# Try multiple selectors for the checkbox
notify_selectors = [
'input[type="checkbox"]:near(:text("Notify on reply"))',
'label:has-text("Notify on reply") input[type="checkbox"]',
'input[type="checkbox"][name*="notify"]',
'input[type="checkbox"][id*="notify"]'
]
for selector in notify_selectors:
notify_checkbox = page.locator(selector)
if await notify_checkbox.count():
is_checked = await notify_checkbox.first.is_checked()
if is_checked:
print("✓ 'Notify on reply' is enabled (checked)", file=sys.stderr)
notify_correct = True
else:
print("❌ ERROR: 'Notify on reply' is not enabled (unchecked)", file=sys.stderr)
verification_passed = False
break
if not notify_correct and verification_passed:
print("❌ ERROR: Cannot verify 'Notify on reply' setting", file=sys.stderr)
verification_passed = False
# Final summary
print("\n" + "="*60)
if verification_passed:
print("✅ SUCCESS: All verification checks passed!", file=sys.stderr)
else:
print("❌ FAILED: One or more verification checks failed!", file=sys.stderr)
print("="*60)
return verification_passed
except PlaywrightTimeoutError as e:
print(f"❌ ERROR: Timeout occurred - {str(e)}", file=sys.stderr)
return False
except Exception as e:
print(f"❌ ERROR: Unexpected error - {str(e)}", file=sys.stderr)
return False
finally:
await browser.close()
def main():
"""
Executes the verification process and exits with a status code.
"""
result = asyncio.run(verify())
sys.exit(0 if result else 1)
if __name__ == "__main__":
main()
================================================
FILE: tasks/playwright_webarena/standard/reddit/buyitforlife_research/description.md
================================================
Research durable kitchen appliances on the BuyItForLife forum by extracting information about the most popular posts and their community engagement.
**Task Requirements:**
1. Register a new account with username 'buyitforlife_researcher' and password 'BIFL2024Research!'
2. After successful registration, navigate to the BuyItForLife forum
3. Find the top 3 posts with the highest upvote counts and extract:
- Exact post title
- Exact upvote count
- Exact number of comments
4. For the post with the highest upvotes from step 3, click through to read the comments and find the comment with the most upvotes. Record:
- The exact comment text
- The username of the commenter
5. From the BuyItForLife forum, identify the usernames of the authors who posted the top 3 posts with the highest number of upvotes
6. After collecting all the data, submit a new post in the BuyItForLife forum with:
- Title: "Research Report for BuyItForLife"
- Body text must be EXACTLY these lines without anything (keep the keys as-is, only replace the values after the pipe, follow the markdown format):
```
- Post1_Title|FILL_VALUE
- Post1_Upvotes|FILL_VALUE
- Post1_Comments|FILL_VALUE
- Post2_Title|FILL_VALUE
- Post2_Upvotes|FILL_VALUE
- Post2_Comments|FILL_VALUE
- Post3_Title|FILL_VALUE
- Post3_Upvotes|FILL_VALUE
- Post3_Comments|FILL_VALUE
- TopComment_Text|FILL_VALUE
- TopComment_Username|FILL_VALUE
- Post1_Author|FILL_VALUE
- Post2_Author|FILL_VALUE
- Post3_Author|FILL_VALUE
```
================================================
FILE: tasks/playwright_webarena/standard/reddit/buyitforlife_research/label.txt
================================================
- Post1_Title|Hand me down name tag trail on this child's jacket
- Post1_Upvotes|14487
- Post1_Comments|163
- Post2_Title|My Grandmother's oven, circa 1966. Many holiday meals were cooked with love here, right up until the day she passed. Aside from one shattered glass door that was replaced, everything still works!
- Post2_Upvotes|6413
- Post2_Comments|205
- Post3_Title|1956 Frigidaire Range
- Post3_Upvotes|5797
- Post3_Comments|190
- TopComment_Text|Rei does this too
- TopComment_Username|ATeaformeplease
- Post1_Author|Mofomania
- Post2_Author|_Mr_Roboto_
- Post3_Author|dezualy
================================================
FILE: tasks/playwright_webarena/standard/reddit/buyitforlife_research/meta.json
================================================
{
"task_id": "buyitforlife_research",
"task_name": "Buy It For Life Research",
"category_id": "reddit",
"category_name": "Reddit",
"description": "Research durable lifetime-quality products by creating forum account, analyzing community recommendations, extracting product data, and compiling comprehensive durability report with voting metrics.",
"author": "Fanqing Meng",
"created_at": "2025-08-12",
"difficulty": "L3",
"tags": [
"user interaction",
"data extraction",
"search aggregation",
"content submission"
],
"mcp": [
"playwright"
],
"meta_data": {
"stateType": "video",
"stateContent": null,
"stateUrl": "https://storage.mcpmark.ai/tasks_state/playwright_video/postmill.mp4",
"stateOriginalUrl": "https://github.com/web-arena-x/webarena/tree/main/environment_docker"
}
}
================================================
FILE: tasks/playwright_webarena/standard/reddit/buyitforlife_research/verify.py
================================================
import asyncio
import sys
import re
import os
from pathlib import Path
from playwright.async_api import (
async_playwright,
TimeoutError as PlaywrightTimeoutError,
)
BASE_URL = os.getenv("WEBARENA_BASE_URL", "http://localhost:9999").rstrip("/")
def parse_markdown_list_format(text):
"""
Parse the markdown list format (- Key|Value) from the submission body.
Also handles plain Key|Value format without the dash.
"""
data = {}
# Pattern to match lines like: - Key|Value or just Key|Value
pattern = r'^(?:-\s*)?([^|]+)\|(.+)$'
lines = text.strip().split('\n')
for line in lines:
line = line.strip()
if '|' in line:
match = re.match(pattern, line)
if match:
key = match.group(1).strip()
value = match.group(2).strip()
data[key] = value
return data
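# Illustrative example of the expected mapping (sample values, not real data):
#   parse_markdown_list_format("- Post1_Upvotes|14487\nTopComment_Username|someone")
#   -> {"Post1_Upvotes": "14487", "TopComment_Username": "someone"}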
def normalize_text(text):
"""
Normalize text for comparison by handling different quote styles and whitespace.
"""
if not isinstance(text, str):
return str(text)
# Replace various quote styles with standard quotes
    text = text.replace("\u2018", "'").replace("\u2019", "'")
    text = text.replace("\u201c", '"').replace("\u201d", '"')
# Normalize whitespace
text = " ".join(text.split())
return text.strip()
async def check_account_login(page):
"""
Check if the account can be successfully logged in.
Returns True if login successful, False otherwise.
"""
try:
# Navigate to main page
await page.goto(f"{BASE_URL}/", wait_until="networkidle")
# Check if already logged in
user_button = page.locator('button[class*="UserButton"]')
if await user_button.count():
username = await user_button.inner_text()
if username.lower() == "buyitforlife_researcher":
print(f"Already logged in as {username}", file=sys.stderr)
return True
else:
# Log out first
await user_button.click()
await page.click('a:has-text("Log out")')
await page.wait_for_load_state("networkidle")
# Try to login
await page.click('a:has-text("Log in")')
await page.wait_for_load_state("networkidle")
await page.fill('input[name="_username"]', "buyitforlife_researcher")
await page.fill('input[name="_password"]', "BIFL2024Research!")
await page.click('button:has-text("Log in")')
await page.wait_for_load_state("networkidle")
# Check if login successful
user_button = page.locator('button:has-text("buyitforlife_researcher")')
if await user_button.count():
print("Successfully logged in as buyitforlife_researcher", file=sys.stderr)
return True
else:
print("Login failed for buyitforlife_researcher", file=sys.stderr)
return False
except Exception as e:
print(f"Error during login check: {str(e)}", file=sys.stderr)
return False
async def check_submission_exists(page):
"""
Check if the submission exists in the BuyItForLife forum with correct content.
Returns (exists, content_data) tuple.
"""
try:
# Navigate to BuyItForLife forum
await page.goto(f"{BASE_URL}/f/BuyItForLife", wait_until="networkidle")
# Look for the post with correct title
print("Looking for post 'Research Report for BuyItForLife'...", file=sys.stderr)
post_locator = page.locator('a:has-text("Research Report for BuyItForLife")')
if not await post_locator.count():
print("Error: Could not find post with title 'Research Report for BuyItForLife'", file=sys.stderr)
return False, None
# Click on the post to view its content
await post_locator.first.click()
await page.wait_for_load_state("networkidle")
# Get the post content
post_content = None
selectors = [
'.PostFullItem-body',
'.Post-body',
'.PostItem-body',
'.item-RichText',
'[class*="RichText"]',
'div:has-text("Post1_Title")',
]
for selector in selectors:
post_content_element = page.locator(selector)
if await post_content_element.count():
# Get the text content, handling multiple elements if needed
if await post_content_element.count() > 1:
for i in range(await post_content_element.count()):
text = await post_content_element.nth(i).inner_text()
if "Post1_Title" in text:
post_content = text
print(f"Found post content using selector: {selector} (element {i})", file=sys.stderr)
break
else:
post_content = await post_content_element.first.inner_text()
print(f"Found post content using selector: {selector}", file=sys.stderr)
if post_content and "Post1_Title" in post_content:
break
if not post_content:
print("Error: Could not find post content element", file=sys.stderr)
return False, None
print("Post content found:", file=sys.stderr)
print(post_content[:200] + "..." if len(post_content) > 200 else post_content, file=sys.stderr)
# Parse the markdown list format
extracted_data = parse_markdown_list_format(post_content)
print(f"Extracted data: {extracted_data}", file=sys.stderr)
return True, extracted_data
except Exception as e:
print(f"Error checking submission: {str(e)}", file=sys.stderr)
return False, None
async def verify() -> bool:
"""
Verifies that the BuyItForLife research task has been completed correctly.
Checks:
1. Account creation (can login with credentials)
2. Submission exists with correct title
3. Submission content matches expected format and values
"""
async with async_playwright() as p:
browser = await p.chromium.launch(headless=True)
context = await browser.new_context()
page = await context.new_page()
try:
# Step 1: Check account creation
print("=== Step 1: Checking account creation ===", file=sys.stderr)
account_ok = await check_account_login(page)
if not account_ok:
print("Error: Account 'buyitforlife_researcher' cannot be logged in", file=sys.stderr)
return False
# Step 2: Check submission exists and get content
print("\n=== Step 2: Checking submission ===", file=sys.stderr)
submission_exists, extracted_data = await check_submission_exists(page)
if not submission_exists:
print("Error: Submission not found in BuyItForLife forum", file=sys.stderr)
return False
if not extracted_data:
print("Error: Could not extract data from submission", file=sys.stderr)
return False
# Step 3: Load expected data from label.txt
print("\n=== Step 3: Validating submission content ===", file=sys.stderr)
label_path = Path(__file__).parent / "label.txt"
if not label_path.exists():
print("Error: label.txt not found", file=sys.stderr)
return False
with open(label_path, "r") as f:
expected_text = f.read().strip()
expected_data = parse_markdown_list_format(expected_text)
print(f"Expected data from label.txt: {expected_data}", file=sys.stderr)
# Verify all required keys are present
required_keys = [
"Post1_Title",
"Post1_Upvotes",
"Post1_Comments",
"Post2_Title",
"Post2_Upvotes",
"Post2_Comments",
"Post3_Title",
"Post3_Upvotes",
"Post3_Comments",
"TopComment_Text",
"TopComment_Username",
"Post1_Author",
"Post2_Author",
"Post3_Author",
]
missing_keys = []
for key in required_keys:
if key not in extracted_data:
missing_keys.append(key)
if missing_keys:
print(f"Error: Missing required keys: {', '.join(missing_keys)}", file=sys.stderr)
return False
# Compare each field with expected values
errors = []
for key in required_keys:
if key in expected_data and key in extracted_data:
expected_val = normalize_text(expected_data[key])
actual_val = normalize_text(extracted_data[key])
# For numeric fields, compare as integers
if "Upvotes" in key or "Comments" in key:
try:
expected_int = int(expected_val)
actual_int = int(actual_val)
if expected_int != actual_int:
errors.append(f"{key} mismatch: got {actual_int}, expected {expected_int}")
except ValueError:
errors.append(f"{key} should be numeric: got '{actual_val}'")
else:
# For text fields, special handling for usernames with underscores
if "Author" in key or key == "TopComment_Username":
expected_core = expected_val.strip('_')
actual_core = actual_val.strip('_')
if expected_core != actual_core:
errors.append(f"{key} mismatch: got '{actual_val}', expected '{expected_val}'")
else:
if expected_val != actual_val:
errors.append(f"{key} mismatch: got '{actual_val}', expected '{expected_val}'")
# Verify upvotes are in descending order
try:
post1_votes = int(extracted_data["Post1_Upvotes"])
post2_votes = int(extracted_data["Post2_Upvotes"])
post3_votes = int(extracted_data["Post3_Upvotes"])
if not (post1_votes >= post2_votes >= post3_votes):
errors.append(f"Posts should be ordered by upvotes: {post1_votes} >= {post2_votes} >= {post3_votes}")
except (ValueError, KeyError):
pass # Already reported above
if errors:
print("Error: Validation failed with the following issues:", file=sys.stderr)
for error in errors:
print(f" - {error}", file=sys.stderr)
return False
# All checks passed
print("\n=== SUCCESS ===", file=sys.stderr)
print("✓ Account 'buyitforlife_researcher' created and can login", file=sys.stderr)
print("✓ Submission 'Research Report for BuyItForLife' found in correct forum", file=sys.stderr)
print("✓ All 14 required fields present and correct", file=sys.stderr)
print("✓ Data matches expected values from label.txt", file=sys.stderr)
print("✓ Posts ordered by upvotes (descending)", file=sys.stderr)
return True
except PlaywrightTimeoutError as e:
print(f"Error: Timeout occurred - {str(e)}", file=sys.stderr)
return False
except Exception as e:
print(f"Error: Unexpected error - {str(e)}", file=sys.stderr)
return False
finally:
await browser.close()
def main():
"""
Executes the verification process and exits with a status code.
"""
result = asyncio.run(verify())
sys.exit(0 if result else 1)
if __name__ == "__main__":
main()
================================================
FILE: tasks/playwright_webarena/standard/reddit/llm_research_summary/description.md
================================================
I need you to perform a comprehensive analysis of Large Language Model discussions on this forum platform and create a summary post. This analysis will help me understand the community's engagement with LLM topics.
**Task Requirements:**
1. Register a new account with username 'llm_analyst_2024' and password 'LLMExpert2024!'
2. After successful registration, navigate to the MachineLearning forum and analyze posts about Large Language Models:
- Count all posts on the first page that contain 'GPT', 'ChatGPT', or 'LLM' in their titles
3. Identify the top 3 posts with the highest upvote counts from your LLM-related posts list and record their complete details
4. Navigate to the deeplearning forum and go to page 2:
- Find the post with the most comments on page 2
- Record its exact title and comment count
5. Create a new submission in the MachineLearning forum with:
- Title: "LLM Research Summary: GPT Discussions Analysis [2024]"
- Body text must be EXACTLY these lines without anything (keep the keys as-is, only replace the values after the pipe, follow the markdown format):
```
- Total_LLM_Posts|FILL_VALUE
- Top1_Title|FILL_VALUE
- Top1_Upvotes|FILL_VALUE
- Top1_Date|FILL_VALUE
- Top2_Title|FILL_VALUE
- Top2_Upvotes|FILL_VALUE
- Top2_Date|FILL_VALUE
- Top3_Title|FILL_VALUE
- Top3_Upvotes|FILL_VALUE
- Top3_Date|FILL_VALUE
- Deeplearning_MostDiscussed|FILL_VALUE
- Deeplearning_Comments|FILL_VALUE
```
================================================
FILE: tasks/playwright_webarena/standard/reddit/llm_research_summary/label.txt
================================================
- Total_LLM_Posts|9
- Top1_Title|[P] I made a command-line tool that explains your errors using ChatGPT (link in comments)
- Top1_Upvotes|2655
- Top1_Date|3 years ago
- Top2_Title|[P] I built Adrenaline, a debugger that fixes errors and explains them with GPT-3
- Top2_Upvotes|1542
- Top2_Date|3 years ago
- Top3_Title|[N] OpenAI may have benchmarked GPT-4's coding ability on it's own training data
- Top3_Upvotes|925
- Top3_Date|2 years ago
- Deeplearning_MostDiscussed|Do companies actually care about their model's training/inference speed?
- Deeplearning_Comments|39
================================================
FILE: tasks/playwright_webarena/standard/reddit/llm_research_summary/meta.json
================================================
{
"task_id": "llm_research_summary",
"task_name": "LLM Research Summary",
"category_id": "reddit",
"category_name": "Reddit",
"description": "Aggregate and analyze LLM research discussions across multiple forums, collect trending topics, compile technical insights, and create comprehensive summary post with community engagement.",
"author": "Fanqing Meng",
"created_at": "2025-08-12",
"difficulty": "L3",
"tags": [
"data extraction",
"search aggregation",
"content submission",
"user interaction"
],
"mcp": [
"playwright"
],
"meta_data": {
"stateType": "video",
"stateContent": null,
"stateUrl": "https://storage.mcpmark.ai/tasks_state/playwright_video/postmill.mp4",
"stateOriginalUrl": "https://github.com/web-arena-x/webarena/tree/main/environment_docker"
}
}
================================================
FILE: tasks/playwright_webarena/standard/reddit/llm_research_summary/verify.py
================================================
import asyncio
import sys
import re
import os
from pathlib import Path
from playwright.async_api import (
async_playwright,
TimeoutError as PlaywrightTimeoutError,
)
# Read the base_url from the environment, falling back to the local default
BASE_URL = os.getenv("WEBARENA_BASE_URL", "http://localhost:9999").rstrip("/")
def parse_key_value_format(text):
"""
Parse the Key|Value format from the submission body.
Handles both pipe (|) and colon (:) separators for compatibility.
"""
data = {}
# Try to parse with pipe separator first (expected format)
lines = text.strip().split('\n')
for line in lines:
line = line.strip()
if not line:
continue
# Remove markdown list prefix if present
if line.startswith('- '):
line = line[2:]
elif line.startswith('* '):
line = line[2:]
# Try pipe separator first
if '|' in line:
parts = line.split('|', 1)
if len(parts) == 2:
key = parts[0].strip()
value = parts[1].strip()
data[key] = value
# Fallback to colon separator for label.txt compatibility
elif ':' in line:
parts = line.split(':', 1)
if len(parts) == 2:
key = parts[0].strip()
value = parts[1].strip()
data[key] = value
return data
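# Illustrative example (sample values): both separators yield the same mapping, e.g.
#   parse_key_value_format("- Total_LLM_Posts|9")  -> {"Total_LLM_Posts": "9"}
#   parse_key_value_format("Total_LLM_Posts: 9")   -> {"Total_LLM_Posts": "9"}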
def normalize_text(text):
"""
Normalize text for comparison by handling different quote styles and whitespace.
"""
if not isinstance(text, str):
return str(text)
# Replace various quote styles with standard quotes
    text = text.replace("\u2018", "'").replace("\u2019", "'")
    text = text.replace("\u201c", '"').replace("\u201d", '"')
# Normalize whitespace
text = " ".join(text.split())
return text.strip()
async def verify() -> bool:
"""
Verifies that the LLM analysis task has been completed correctly.
"""
async with async_playwright() as p:
browser = await p.chromium.launch(headless=True)
context = await browser.new_context()
page = await context.new_page()
try:
# Navigate to the main page
print("Navigating to forum...", file=sys.stderr)
await page.goto(f"{BASE_URL}/", wait_until="networkidle")
# Check if logged in as llm_analyst_2024
user_button = page.locator('button:has-text("llm_analyst_2024")')
if not await user_button.count():
# Try to login
print("Not logged in, attempting to login...", file=sys.stderr)
await page.click('a:has-text("Log in")')
await page.wait_for_load_state("networkidle")
await page.fill('input[name="_username"]', "llm_analyst_2024")
await page.fill('input[name="_password"]', "LLMExpert2024!")
await page.click('button:has-text("Log in")')
await page.wait_for_load_state("networkidle")
user_button = page.locator('button:has-text("llm_analyst_2024")')
if not await user_button.count():
print("Error: Login failed for llm_analyst_2024", file=sys.stderr)
return False
print("Successfully logged in as llm_analyst_2024", file=sys.stderr)
else:
print("Already logged in as llm_analyst_2024", file=sys.stderr)
# Navigate to MachineLearning forum
print("Navigating to MachineLearning forum...", file=sys.stderr)
await page.goto(
f"{BASE_URL}/f/MachineLearning", wait_until="networkidle"
)
# Look for the submission with our specific title
print(
"Looking for submission 'LLM Research Summary: GPT Discussions Analysis [2024]'...",
file=sys.stderr,
)
post_link = page.locator(
'a:has-text("LLM Research Summary: GPT Discussions Analysis [2024]")'
)
if not await post_link.count():
print(
"Error: Could not find submission with required title",
file=sys.stderr,
)
return False
# Click on the submission to view its content
await post_link.first.click()
await page.wait_for_load_state("networkidle")
# Extract the submission body content
# Try multiple possible selectors for the post body
post_content = None
selectors = [
".submission__body",
".post-body",
".RichText",
'[class*="RichText"]',
'div:has(> p:has-text("Total_LLM_Posts"))',
'div:has-text("Total_LLM_Posts"):has-text("Deeplearning_Comments")',
]
for selector in selectors:
content_element = page.locator(selector)
if await content_element.count():
post_content = await content_element.first.inner_text()
if "Total_LLM_Posts" in post_content:
print(
f"Found submission content using selector: {selector}",
file=sys.stderr,
)
break
if not post_content or "Total_LLM_Posts" not in post_content:
print(
"Error: Could not find submission body with required format",
file=sys.stderr,
)
return False
print("Submission content found, parsing data...", file=sys.stderr)
print(f"Raw content: {post_content[:200]}...", file=sys.stderr)
# Parse the Key: Value format
extracted_data = parse_key_value_format(post_content)
print(f"Extracted data: {extracted_data}", file=sys.stderr)
# Load expected values from label.txt
label_path = Path(__file__).parent / "label.txt"
if label_path.exists():
with open(label_path, "r") as f:
expected_text = f.read().strip()
expected_data = parse_key_value_format(expected_text)
print("Loaded expected values from label.txt", file=sys.stderr)
# Verify all required keys are present
required_keys = [
"Total_LLM_Posts",
"Top1_Title",
"Top1_Upvotes",
"Top1_Date",
"Top2_Title",
"Top2_Upvotes",
"Top2_Date",
"Top3_Title",
"Top3_Upvotes",
"Top3_Date",
"Deeplearning_MostDiscussed",
"Deeplearning_Comments",
]
missing_keys = []
for key in required_keys:
if key not in extracted_data:
missing_keys.append(key)
if missing_keys:
print(
f"Error: Missing required keys: {', '.join(missing_keys)}",
file=sys.stderr,
)
return False
# Validate data format and content
errors = []
# Check Total_LLM_Posts is a number and matches expected
try:
total_posts = int(extracted_data["Total_LLM_Posts"])
if "expected_data" in locals() and "Total_LLM_Posts" in expected_data:
expected_total = int(expected_data["Total_LLM_Posts"])
if total_posts != expected_total:
errors.append(
f"Total_LLM_Posts mismatch: got {total_posts}, expected {expected_total}"
)
elif total_posts < 5: # Based on exploration, should be at least 5
errors.append(f"Total_LLM_Posts seems too low: {total_posts}")
except ValueError:
errors.append(
f"Total_LLM_Posts must be a number, got: {extracted_data['Total_LLM_Posts']}"
)
# If we have expected data, compare against it
if "expected_data" in locals():
# Compare each field
for key in required_keys:
if key in expected_data and key in extracted_data:
expected_val = normalize_text(expected_data[key])
actual_val = normalize_text(extracted_data[key])
# For numeric fields, compare as integers
if (
"Upvotes" in key
or "Comments" in key
or key == "Total_LLM_Posts"
):
try:
expected_int = int(expected_val)
actual_int = int(actual_val)
if expected_int != actual_int:
errors.append(
f"{key} mismatch: got {actual_int}, expected {expected_int}"
)
except ValueError:
errors.append(
f"{key} should be numeric: got '{actual_val}'"
)
else:
# For text fields, compare normalized text
if expected_val != actual_val:
errors.append(
f"{key} mismatch: got '{actual_val}', expected '{expected_val}'"
)
else:
# If no expected data, just do basic validation
for key in required_keys:
if key not in extracted_data:
errors.append(f"Missing required key: {key}")
elif (
not extracted_data[key] or extracted_data[key] == "[FILL_VALUE]"
):
errors.append(f"{key} was not filled in")
# Verify upvotes are in descending order for top 3
try:
top1_votes = int(extracted_data["Top1_Upvotes"])
top2_votes = int(extracted_data["Top2_Upvotes"])
top3_votes = int(extracted_data["Top3_Upvotes"])
if not (top1_votes >= top2_votes >= top3_votes):
errors.append(
f"Top posts should be ordered by upvotes: {top1_votes} >= {top2_votes} >= {top3_votes}"
)
except (ValueError, KeyError):
pass # Already reported above
if errors:
print(
"Error: Validation failed with the following issues:",
file=sys.stderr,
)
for error in errors:
print(f" - {error}", file=sys.stderr)
return False
# All checks passed
print("Success: LLM analysis task completed successfully.")
print("- Account llm_analyst_2024 verified")
print(
"- Submission 'LLM Research Summary: GPT Discussions Analysis [2024]' found"
)
print(
f"- Total LLM-related posts analyzed: {extracted_data['Total_LLM_Posts']}"
)
print("- Top 3 posts by upvotes identified and documented")
print(
f"- Deeplearning forum page 2 most discussed post: {extracted_data['Deeplearning_MostDiscussed']}"
)
print("- All data in correct Key: Value format with 12 lines")
return True
except PlaywrightTimeoutError as e:
print(f"Error: Timeout occurred - {str(e)}", file=sys.stderr)
return False
except Exception as e:
print(f"Error: Unexpected error - {str(e)}", file=sys.stderr)
return False
finally:
await browser.close()
def main():
"""
Executes the verification process and exits with a status code.
"""
result = asyncio.run(verify())
sys.exit(0 if result else 1)
if __name__ == "__main__":
main()
================================================
FILE: tasks/playwright_webarena/standard/reddit/movie_reviewer_analysis/description.md
================================================
I need you to analyze wonderful movie discussions on Postmill and create a summary post. This analysis will help me understand the community's engagement with movie topics.
**Task Requirements:**
1. Register a new account with username 'movie_reviewer_2024' and password 'movie_reviewer_2024'
2. After successful registration, navigate to the movies forum and analyze the posts on the first page:
- Count all posts that have 4-digit years in their titles
3. Identify the top 3 posts with the highest upvote counts from ALL posts on the first page and record their complete details
4. Find this specific post and record its data:
- The 'Rittenhouse Square' poster post (record exact upvotes and comments)
5. Navigate through the first 5 pages of movies forum and count the total number of image/poster submissions across all 5 pages
6. Create a new submission in the movies forum with:
- Title: "Wonderful Movies Analysis: Community Favorites [2024]"
- Body text must be EXACTLY these lines without anything (keep the keys as-is, only replace the values after the pipe, follow the markdown format):
```
- Total_Year_Posts|FILL_VALUE
- Top1_Title|FILL_VALUE
- Top1_Upvotes|FILL_VALUE
- Top1_Comments|FILL_VALUE
- Top2_Title|FILL_VALUE
- Top2_Upvotes|FILL_VALUE
- Top2_Comments|FILL_VALUE
- Top3_Title|FILL_VALUE
- Top3_Upvotes|FILL_VALUE
- Top3_Comments|FILL_VALUE
- Rittenhouse_Upvotes|FILL_VALUE
- Rittenhouse_Comments|FILL_VALUE
- Total_Image_Posts_5Pages|FILL_VALUE
```
================================================
FILE: tasks/playwright_webarena/standard/reddit/movie_reviewer_analysis/label.txt
================================================
- Total_Year_Posts|1
- Top1_Title|Who will win the Oscar for ACTRESS IN A SUPPORTING ROLE?
- Top1_Upvotes|9933
- Top1_Comments|23
- Top2_Title|Who will win the Oscar for FILM EDITING?
- Top2_Upvotes|7720
- Top2_Comments|20
- Top3_Title|Cindy Williams Dies: 'Laverne & Shirley' Star Who Appeared In 'American Graffiti' & 'The Conversation' Was 75
- Top3_Upvotes|5268
- Top3_Comments|190
- Rittenhouse_Upvotes|2761
- Rittenhouse_Comments|182
- Total_Image_Posts_5Pages|6
================================================
FILE: tasks/playwright_webarena/standard/reddit/movie_reviewer_analysis/meta.json
================================================
{
"task_id": "movie_reviewer_analysis",
"task_name": "Movie Reviewer Analysis",
"category_id": "reddit",
"category_name": "Reddit",
"description": "Analyze movie review patterns by creating reviewer profile, collecting ratings data, tracking review trends, and generating analytical report on community movie preferences and discussions.",
"author": "Fanqing Meng",
"created_at": "2025-08-12",
"difficulty": "L3",
"tags": [
"user interaction",
"data extraction",
"comparative analysis",
"content submission"
],
"mcp": [
"playwright"
],
"meta_data": {
"stateType": "video",
"stateContent": null,
"stateUrl": "https://storage.mcpmark.ai/tasks_state/playwright_video/postmill.mp4",
"stateOriginalUrl": "https://github.com/web-arena-x/webarena/tree/main/environment_docker"
}
}
================================================
FILE: tasks/playwright_webarena/standard/reddit/movie_reviewer_analysis/verify.py
================================================
import asyncio
import sys
import re
import os
from pathlib import Path
from playwright.async_api import (
async_playwright,
TimeoutError as PlaywrightTimeoutError,
)
# Read the base_url from the environment, falling back to the default address
BASE_URL = os.getenv("WEBARENA_BASE_URL", "http://localhost:9999").rstrip("/")
print(f"Using base URL: {BASE_URL}")
def parse_key_value_format(text):
"""
Parse the Key|Value format from the submission body using regex.
Works regardless of line breaks.
"""
data = {}
# Define patterns for each field with the pipe separator
patterns = {
"Total_Year_Posts": r"Total_Year_Posts\s*\|\s*(\d+)",
"Top1_Title": r"Top1_Title\s*\|\s*(.+?)(?=\nTop1_Upvotes|$)",
"Top1_Upvotes": r"Top1_Upvotes\s*\|\s*(\d+)",
"Top1_Comments": r"Top1_Comments\s*\|\s*(\d+)",
"Top2_Title": r"Top2_Title\s*\|\s*(.+?)(?=\nTop2_Upvotes|$)",
"Top2_Upvotes": r"Top2_Upvotes\s*\|\s*(\d+)",
"Top2_Comments": r"Top2_Comments\s*\|\s*(\d+)",
"Top3_Title": r"Top3_Title\s*\|\s*(.+?)(?=\nTop3_Upvotes|$)",
"Top3_Upvotes": r"Top3_Upvotes\s*\|\s*(\d+)",
"Top3_Comments": r"Top3_Comments\s*\|\s*(\d+)",
"Rittenhouse_Upvotes": r"Rittenhouse_Upvotes\s*\|\s*(\d+)",
"Rittenhouse_Comments": r"Rittenhouse_Comments\s*\|\s*(\d+)",
"Total_Image_Posts_5Pages": r"Total_Image_Posts_5Pages\s*\|\s*(\d+)",
}
# Extract each field using regex
for key, pattern in patterns.items():
match = re.search(pattern, text, re.DOTALL | re.MULTILINE)
if match:
# For title fields, clean up newlines and extra spaces
value = match.group(1).strip()
if "Title" in key:
# Replace newlines with spaces and normalize whitespace
value = " ".join(value.split())
data[key] = value
return data
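# Illustrative example (sample values, not real data):
#   parse_key_value_format("- Top1_Title|Example title\n- Top1_Upvotes|9933")
#   -> {"Top1_Title": "Example title", "Top1_Upvotes": "9933"}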
def normalize_text(text):
"""
Normalize text for comparison by handling different quote styles and whitespace.
"""
if not isinstance(text, str):
return str(text)
# Replace various quote styles with standard quotes
    text = text.replace("\u2018", "'").replace("\u2019", "'")
    text = text.replace("\u201c", '"').replace("\u201d", '"')
text = text.replace("&", "&")
# Normalize whitespace
text = " ".join(text.split())
return text.strip()
async def verify() -> bool:
"""
Verifies that the wonderful movies analysis task has been completed correctly.
"""
async with async_playwright() as p:
browser = await p.chromium.launch(headless=True)
context = await browser.new_context()
page = await context.new_page()
try:
# Navigate to the main page
print("Navigating to forum...", file=sys.stderr)
await page.goto(f"{BASE_URL}/", wait_until="networkidle")
# Check if logged in as movie_reviewer_2024
user_button = page.locator('button:has-text("movie_reviewer_2024")')
if not await user_button.count():
# Try to login
print("Not logged in, attempting to login...", file=sys.stderr)
await page.click('a:has-text("Log in")')
await page.wait_for_load_state("networkidle")
await page.fill('input[name="_username"]', "movie_reviewer_2024")
await page.fill('input[name="_password"]', "movie_reviewer_2024")
await page.click('button:has-text("Log in")')
await page.wait_for_load_state("networkidle")
user_button = page.locator('button:has-text("movie_reviewer_2024")')
if not await user_button.count():
print(
"Error: Login failed for movie_reviewer_2024", file=sys.stderr
)
return False
print("Successfully logged in as movie_reviewer_2024", file=sys.stderr)
else:
print("Already logged in as movie_reviewer_2024", file=sys.stderr)
# Navigate to movies forum
print("Navigating to movies forum...", file=sys.stderr)
await page.goto(
f"{BASE_URL}/f/movies", wait_until="networkidle"
)
# Look for the submission with our specific title
print(
"Looking for submission 'Wonderful Movies Analysis: Community Favorites [2024]'...",
file=sys.stderr,
)
post_link = page.locator(
'a:has-text("Wonderful Movies Analysis: Community Favorites [2024]")'
)
if not await post_link.count():
print(
"Error: Could not find submission with required title",
file=sys.stderr,
)
return False
# Click on the submission to view its content
await post_link.first.click()
await page.wait_for_load_state("networkidle")
# Extract the submission body content
# Try multiple possible selectors for the post body
post_content = None
selectors = [
".submission__body",
".post-body",
".RichText",
'[class*="RichText"]',
'div:has(> p:has-text("Total_Year_Posts"))',
'div:has-text("Total_Year_Posts"):has-text("Total_Image_Posts_5Pages")',
]
for selector in selectors:
content_element = page.locator(selector)
if await content_element.count():
post_content = await content_element.first.inner_text()
if "Total_Year_Posts" in post_content:
print(
f"Found submission content using selector: {selector}",
file=sys.stderr,
)
break
if not post_content or "Total_Year_Posts" not in post_content:
print(
"Error: Could not find submission body with required format",
file=sys.stderr,
)
return False
print("Submission content found, parsing data...", file=sys.stderr)
print(f"Raw content: {post_content[:200]}...", file=sys.stderr)
            # Parse the Key|Value format
extracted_data = parse_key_value_format(post_content)
print(f"Extracted data: {extracted_data}", file=sys.stderr)
# Load expected values from label.txt
label_path = Path(__file__).parent / "label.txt"
if label_path.exists():
with open(label_path, "r") as f:
expected_text = f.read().strip()
expected_data = parse_key_value_format(expected_text)
print("Loaded expected values from label.txt", file=sys.stderr)
# Verify all required keys are present
required_keys = [
"Total_Year_Posts",
"Top1_Title",
"Top1_Upvotes",
"Top1_Comments",
"Top2_Title",
"Top2_Upvotes",
"Top2_Comments",
"Top3_Title",
"Top3_Upvotes",
"Top3_Comments",
"Rittenhouse_Upvotes",
"Rittenhouse_Comments",
"Total_Image_Posts_5Pages",
]
missing_keys = []
for key in required_keys:
if key not in extracted_data:
missing_keys.append(key)
if missing_keys:
print(
f"Error: Missing required keys: {', '.join(missing_keys)}",
file=sys.stderr,
)
return False
# Validate data format and content
errors = []
# Check Total_Year_Posts is a number and matches expected
try:
total_posts = int(extracted_data["Total_Year_Posts"])
if "expected_data" in locals() and "Total_Year_Posts" in expected_data:
expected_total = int(expected_data["Total_Year_Posts"])
if total_posts != expected_total:
errors.append(
f"Total_Year_Posts mismatch: got {total_posts}, expected {expected_total}"
)
except ValueError:
errors.append(
f"Total_Year_Posts must be a number, got: {extracted_data['Total_Year_Posts']}"
)
# If we have expected data, compare against it
if "expected_data" in locals():
# Compare each field
for key in required_keys:
if key in expected_data and key in extracted_data:
expected_val = normalize_text(expected_data[key])
actual_val = normalize_text(extracted_data[key])
# For numeric fields, compare as integers
if (
"Upvotes" in key
or "Comments" in key
or key == "Total_Year_Posts"
or key == "Total_Image_Posts_5Pages"
):
try:
expected_int = int(expected_val)
actual_int = int(actual_val)
if expected_int != actual_int:
errors.append(
f"{key} mismatch: got {actual_int}, expected {expected_int}"
)
except ValueError:
errors.append(
f"{key} should be numeric: got '{actual_val}'"
)
else:
# For text fields, compare normalized text
if expected_val != actual_val:
errors.append(
f"{key} mismatch: got '{actual_val}', expected '{expected_val}'"
)
else:
# If no expected data, just do basic validation
for key in required_keys:
if key not in extracted_data:
errors.append(f"Missing required key: {key}")
elif (
not extracted_data[key] or extracted_data[key] == "[FILL_VALUE]"
):
errors.append(f"{key} was not filled in")
if errors:
print(
"Error: Validation failed with the following issues:",
file=sys.stderr,
)
for error in errors:
print(f" - {error}", file=sys.stderr)
return False
# All checks passed
print("Success: Wonderful movies analysis task completed successfully.")
print("- Account movie_reviewer_2024 verified")
print(
"- Submission 'Wonderful Movies Analysis: Community Favorites [2024]' found"
)
print(f"- Total posts with years: {extracted_data['Total_Year_Posts']}")
print("- Top 3 posts by upvotes identified and documented")
print(
f"- Rittenhouse Square data: {extracted_data['Rittenhouse_Upvotes']} upvotes, {extracted_data['Rittenhouse_Comments']} comments"
)
print(
f"- Total image posts across 5 pages: {extracted_data['Total_Image_Posts_5Pages']}"
)
print("- All data in correct Key|Value format")
return True
except PlaywrightTimeoutError as e:
print(f"Error: Timeout occurred - {str(e)}", file=sys.stderr)
return False
except Exception as e:
print(f"Error: Unexpected error - {str(e)}", file=sys.stderr)
return False
finally:
await browser.close()
def main():
"""
Executes the verification process and exits with a status code.
"""
result = asyncio.run(verify())
sys.exit(0 if result else 1)
if __name__ == "__main__":
main()
================================================
FILE: tasks/playwright_webarena/standard/reddit/nba_statistics_analysis/description.md
================================================
I'm conducting research on NBA player discussions in online sports communities. Please help me create a comprehensive analysis.
**Task Requirements:**
1. Register a new account with username 'NBA_DataAnalyst_2024' and password 'Research#2024!'
2. Navigate to the sports forum and search for posts containing 'NBA' in their titles:
- Collect data from the 5 NBA-related posts with the most comments
- For each post, record: the exact post title, vote count, comment count, and the username of the person who submitted it
3. Visit the user profile of 'BCLetsRide69':
- Count his total submissions
4. Create a new submission in the sports forum with:
- Title: "Statistical Analysis: NBA Content Engagement on This Forum"
- Body text must be EXACTLY these lines and nothing else (keep the keys as-is, only replace the values after the pipe, follow the markdown format):
```
- Total_NBA_Posts|FILL_VALUE
- Top1_Title|FILL_VALUE
- Top1_Votes|FILL_VALUE
- Top1_Comments|FILL_VALUE
- Top1_Author|FILL_VALUE
- Top2_Title|FILL_VALUE
- Top2_Votes|FILL_VALUE
- Top2_Comments|FILL_VALUE
- Top2_Author|FILL_VALUE
- Top3_Title|FILL_VALUE
- Top3_Votes|FILL_VALUE
- Top3_Comments|FILL_VALUE
- Top3_Author|FILL_VALUE
- Top4_Title|FILL_VALUE
- Top4_Votes|FILL_VALUE
- Top4_Comments|FILL_VALUE
- Top4_Author|FILL_VALUE
- Top5_Title|FILL_VALUE
- Top5_Votes|FILL_VALUE
- Top5_Comments|FILL_VALUE
- Top5_Author|FILL_VALUE
- BCLetsRide69_Total_Posts|FILL_VALUE
```
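For illustration, a minimal sketch of assembling the filled-in body from collected values; the figures below are placeholders, not the answer:

```python
# Hypothetical values; every required key must appear, following the template above.
stats = {
    "Total_NBA_Posts": 20,
    "Top1_Title": "Example post title",
    "Top1_Votes": 614,
    "Top1_Comments": 170,
    "Top1_Author": "example_author",
    # ... Top2 through Top5 plus BCLetsRide69_Total_Posts follow the same pattern
}
body = "\n".join(f"- {key}|{value}" for key, value in stats.items())
print(body)  # first line: "- Total_NBA_Posts|20"
```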
================================================
FILE: tasks/playwright_webarena/standard/reddit/nba_statistics_analysis/label.txt
================================================
- Total_NBA_Posts|20
- Top1_Title|Hamby claims [WNBA Champ] Aces 'unprofessional' after trade
- Top1_Votes|614
- Top1_Comments|170
- Top1_Author|Responsible-Lunch815
- Top2_Title|Heat place literally every player on injury report after receiving NBA fine ahead of Mexico City game
- Top2_Votes|1266
- Top2_Comments|145
- Top2_Author|XXmynameisNeganXX
- Top3_Title|[ESPN] Announced attendance at the Alamodome tonight|68,323, a new single-game NBA record, in the Spurs' first game there since Game 4 of the 2002 Western Conference Semifinals.
- Top3_Votes|1511
- Top3_Comments|101
- Top3_Author|dragon8811
- Top4_Title|Phoenix Mercury confirm Brittney Griner’s return to WNBA
- Top4_Votes|0
- Top4_Comments|42
- Top4_Author|rejs7
- Top5_Title|Perspective | Kyrie Irving lit a flame. The NBA, top to bottom, watched the fire spread.
- Top5_Votes|74
- Top5_Comments|32
- Top5_Author|tomyland
- BCLetsRide69_Total_Posts|48
================================================
FILE: tasks/playwright_webarena/standard/reddit/nba_statistics_analysis/meta.json
================================================
{
"task_id": "nba_statistics_analysis",
"task_name": "NBA Statistics Analysis",
"category_id": "reddit",
"category_name": "Reddit",
"description": "Create sports analytics account, collect NBA player statistics from forum discussions, analyze basketball performance metrics, and compile comprehensive statistical report with community insights.",
"author": "Fanqing Meng",
"created_at": "2025-08-12",
"difficulty": "L3",
"tags": [
"user interaction",
"data extraction",
"comparative analysis",
"content submission"
],
"mcp": [
"playwright"
],
"meta_data": {
"stateType": "video",
"stateContent": null,
"stateUrl": "https://storage.mcpmark.ai/tasks_state/playwright_video/postmill.mp4",
"stateOriginalUrl": "https://github.com/web-arena-x/webarena/tree/main/environment_docker"
}
}
================================================
FILE: tasks/playwright_webarena/standard/reddit/nba_statistics_analysis/verify.py
================================================
import asyncio
import sys
import re
import os
from pathlib import Path
from playwright.async_api import (
async_playwright,
TimeoutError as PlaywrightTimeoutError,
)
# Read the base_url from the environment variable, falling back to the local default
BASE_URL = os.getenv("WEBARENA_BASE_URL", "http://localhost:9999").rstrip("/")
def parse_key_value_format(text):
"""
Parse the Key|Value format from the submission body.
This handles both the expected format from label.txt and the submission format.
"""
data = {}
# Split by lines and parse each line
lines = text.strip().split('\n')
for line in lines:
line = line.strip()
if not line or line.startswith('#'):
continue
# Remove bullet point if present
if line.startswith('- '):
line = line[2:]
elif line.startswith('• '):
line = line[2:]
# Parse pipe-separated format
if '|' in line:
parts = line.split('|', 1)
if len(parts) == 2:
key = parts[0].strip()
value = parts[1].strip()
if value and value != 'FILL_VALUE':
data[key] = value
return data
def normalize_text(text):
"""
Normalize text for comparison by handling different quote styles and whitespace.
"""
if not isinstance(text, str):
return str(text)
# Replace various quote styles with standard quotes - use unicode escapes to be safe
text = text.replace("\u2018", "'")  # LEFT SINGLE QUOTATION MARK
text = text.replace("\u2019", "'")  # RIGHT SINGLE QUOTATION MARK
text = text.replace("\u201c", '"')  # LEFT DOUBLE QUOTATION MARK
text = text.replace("\u201d", '"')  # RIGHT DOUBLE QUOTATION MARK
# Normalize whitespace
text = " ".join(text.split())
return text.strip()
async def verify() -> bool:
"""
Verifies that the NBA analysis task has been completed correctly.
"""
async with async_playwright() as p:
browser = await p.chromium.launch(headless=True)
context = await browser.new_context()
page = await context.new_page()
try:
# Navigate to the main page
print("Navigating to forum...", file=sys.stderr)
await page.goto(f"{BASE_URL}/", wait_until="networkidle")
# Check if logged in as NBA_DataAnalyst_2024
user_button = page.locator('button:has-text("NBA_DataAnalyst_2024")')
if not await user_button.count():
# Try to login
print("Not logged in, attempting to login...", file=sys.stderr)
await page.click('a:has-text("Log in")')
await page.wait_for_load_state("networkidle")
await page.fill('input[name="_username"]', "NBA_DataAnalyst_2024")
await page.fill('input[name="_password"]', "Research#2024!")
await page.click('button:has-text("Log in")')
await page.wait_for_load_state("networkidle")
user_button = page.locator('button:has-text("NBA_DataAnalyst_2024")')
if not await user_button.count():
print(
"Error: Login failed for NBA_DataAnalyst_2024", file=sys.stderr
)
return False
print("Successfully logged in as NBA_DataAnalyst_2024", file=sys.stderr)
else:
print("Already logged in as NBA_DataAnalyst_2024", file=sys.stderr)
# Navigate to sports forum to check submission
print("Navigating to sports forum to check submission...", file=sys.stderr)
await page.goto(
f"{BASE_URL}/f/sports", wait_until="networkidle"
)
# Look for the submission with our specific title
print(
"Looking for submission 'Statistical Analysis: NBA Content Engagement on This Forum'...",
file=sys.stderr,
)
post_link = page.locator(
'a:has-text("Statistical Analysis: NBA Content Engagement on This Forum")'
)
if not await post_link.count():
print(
"Error: Could not find submission with required title",
file=sys.stderr,
)
return False
# Click on the submission to view its content
await post_link.first.click()
await page.wait_for_load_state("networkidle")
# Extract the submission body content
# Try multiple possible selectors for the post body
post_content = None
selectors = [
".submission__body",
".post-body",
".RichText",
'[class*="RichText"]',
'div:has(> p:has-text("Total_NBA_Posts"))',
'div:has-text("Total_NBA_Posts"):has-text("Most_Popular_NBA_Author")',
]
for selector in selectors:
content_element = page.locator(selector)
if await content_element.count():
post_content = await content_element.first.inner_text()
if "Total_NBA_Posts" in post_content:
print(
f"Found submission content using selector: {selector}",
file=sys.stderr,
)
break
if not post_content or "Total_NBA_Posts" not in post_content:
print(
"Error: Could not find submission body with required format",
file=sys.stderr,
)
return False
print("Submission content found, parsing data...", file=sys.stderr)
print(f"Raw content: {post_content[:200]}...", file=sys.stderr)
# Parse the Key|Value format
extracted_data = parse_key_value_format(post_content)
print(f"Extracted data: {extracted_data}", file=sys.stderr)
# Load expected values from label.txt
label_path = Path(__file__).parent / "label.txt"
if label_path.exists():
with open(label_path, "r") as f:
expected_text = f.read().strip()
expected_data = parse_key_value_format(expected_text)
print("Loaded expected values from label.txt", file=sys.stderr)
# Verify all required keys are present
required_keys = [
"Total_NBA_Posts",
"Top1_Title",
"Top1_Votes",
"Top1_Comments",
"Top1_Author",
"Top2_Title",
"Top2_Votes",
"Top2_Comments",
"Top2_Author",
"Top3_Title",
"Top3_Votes",
"Top3_Comments",
"Top3_Author",
"Top4_Title",
"Top4_Votes",
"Top4_Comments",
"Top4_Author",
"Top5_Title",
"Top5_Votes",
"Top5_Comments",
"Top5_Author",
"BCLetsRide69_Total_Posts",
]
missing_keys = []
for key in required_keys:
if key not in extracted_data:
missing_keys.append(key)
if missing_keys:
print(
f"Error: Missing required keys: {', '.join(missing_keys)}",
file=sys.stderr,
)
return False
# Validate data format and content
errors = []
# Check Total_NBA_Posts is a number and matches expected
try:
total_posts = int(extracted_data["Total_NBA_Posts"])
if "expected_data" in locals() and "Total_NBA_Posts" in expected_data:
expected_total = int(expected_data["Total_NBA_Posts"])
if total_posts != expected_total:
errors.append(
f"Total_NBA_Posts mismatch: got {total_posts}, expected {expected_total}"
)
elif (
total_posts < 5
): # Should be at least 5 since we're collecting top 5
errors.append(f"Total_NBA_Posts seems too low: {total_posts}")
except ValueError:
errors.append(
f"Total_NBA_Posts must be a number, got: {extracted_data['Total_NBA_Posts']}"
)
# If we have expected data, compare against it
if "expected_data" in locals():
# Compare each field
for key in required_keys:
if key in expected_data and key in extracted_data:
expected_val = normalize_text(expected_data[key])
actual_val = normalize_text(extracted_data[key])
# For numeric fields, compare as integers
if (
"Votes" in key
or "Comments" in key
or key == "Total_NBA_Posts"
or key == "BCLetsRide69_Total_Posts"
):
try:
expected_int = int(expected_val)
actual_int = int(actual_val)
if expected_int != actual_int:
errors.append(
f"{key} mismatch: got {actual_int}, expected {expected_int}"
)
except ValueError:
errors.append(
f"{key} should be numeric: got '{actual_val}'"
)
else:
# For text fields, compare normalized text
if expected_val != actual_val:
errors.append(
f"{key} mismatch: got '{actual_val}', expected '{expected_val}'"
)
else:
# If no expected data, just do basic validation
for key in required_keys:
if key not in extracted_data:
errors.append(f"Missing required key: {key}")
elif (
not extracted_data[key] or extracted_data[key] == "[FILL_VALUE]"
):
errors.append(f"{key} was not filled in")
if errors:
print(
"Error: Validation failed with the following issues:",
file=sys.stderr,
)
for error in errors:
print(f" - {error}", file=sys.stderr)
return False
# All checks passed
print("Success: NBA analysis task completed successfully.")
print("- Account NBA_DataAnalyst_2024 verified")
print(
"- Submission 'Statistical Analysis: NBA Content Engagement on This Forum' found"
)
print(
f"- Total NBA-related posts analyzed: {extracted_data['Total_NBA_Posts']}"
)
print("- Top 5 posts identified and documented")
print(
f"- BCLetsRide69's total posts: {extracted_data['BCLetsRide69_Total_Posts']}"
)
print("- All data in correct Key|Value format")
return True
except PlaywrightTimeoutError as e:
print(f"Error: Timeout occurred - {str(e)}", file=sys.stderr)
return False
except Exception as e:
print(f"Error: Unexpected error - {str(e)}", file=sys.stderr)
return False
finally:
await browser.close()
def main():
"""
Executes the verification process and exits with a status code.
"""
result = asyncio.run(verify())
sys.exit(0 if result else 1)
if __name__ == "__main__":
main()
================================================
FILE: tasks/playwright_webarena/standard/reddit/routine_tracker_forum/description.md
================================================
Create a daily routine tracking system on the Postmill forum platform by setting up a new account and engaging with productivity-related content in the LifeProTips community.
**Task Requirements:**
1. Create a new account with the exact username "RoutineTracker2025" and password "DailyRoutine123!"
2. Locate the following specific post:
- Title: "LPT: Use your calendar as your to-do list. Assigning dedicated time to tasks increases the likelyhood of you acting upon it."
- Record the content of the most upvoted comment of this post
3. Create a new submission in the LifeProTips forum with these exact specifications:
- Title: "My 5-Step Morning Routine That Increased My Productivity by 200%"
- Body content must be the content recorded in the above step
4. After successfully posting, verify that your post appears in the LifeProTips forum listing
5. Engage with the community by upvoting exactly these two posts:
- The calendar to-do list post you found in step 2
- The post titled "LPT: clean your stovetop after using the oven. The heat loosens grime for easy removal"
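A minimal Playwright sketch of the posting step in item 3 is shown below; the submit URL, form field names, and button label are assumptions about the Postmill UI, and the snippet presumes the RoutineTracker2025 session is already logged in (the verification script that follows only checks the results, not these exact calls):

```python
import asyncio
from playwright.async_api import async_playwright

BASE_URL = "http://localhost:9999"  # assumed local Postmill instance, as in verify.py

async def post_routine_tip(title: str, body: str) -> None:
    """Sketch: submit a new post to /f/LifeProTips (selectors are assumptions)."""
    async with async_playwright() as p:
        browser = await p.chromium.launch(headless=True)
        page = await browser.new_page()
        await page.goto(f"{BASE_URL}/f/LifeProTips/submit", wait_until="networkidle")
        await page.fill('input[name="submission[title]"]', title)   # assumed field name
        await page.fill('textarea[name="submission[body]"]', body)  # assumed field name
        await page.click('button:has-text("Create submission")')    # assumed button label
        await page.wait_for_load_state("networkidle")
        await browser.close()

if __name__ == "__main__":
    asyncio.run(post_routine_tip(
        "My 5-Step Morning Routine That Increased My Productivity by 200%",
        "Body copied from the most upvoted comment found in step 2.",
    ))
```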
================================================
FILE: tasks/playwright_webarena/standard/reddit/routine_tracker_forum/meta.json
================================================
{
"task_id": "routine_tracker_forum",
"task_name": "Routine Tracker Forum",
"category_id": "reddit",
"category_name": "Reddit",
"description": "Establish productivity community by creating account, setting up routine tracking forum, implementing daily habit systems, and engaging members with structured productivity challenges and resources.",
"author": "Fanqing Meng",
"created_at": "2025-08-12",
"difficulty": "L3",
"tags": [
"user interaction",
"content submission"
],
"mcp": [
"playwright"
],
"meta_data": {
"stateType": "video",
"stateContent": null,
"stateUrl": "https://storage.mcpmark.ai/tasks_state/playwright_video/postmill.mp4",
"stateOriginalUrl": "https://github.com/web-arena-x/webarena/tree/main/environment_docker"
}
}
================================================
FILE: tasks/playwright_webarena/standard/reddit/routine_tracker_forum/verify.py
================================================
import asyncio
import sys
import os
from pathlib import Path
from datetime import datetime
from playwright.async_api import (
async_playwright,
TimeoutError as PlaywrightTimeoutError,
)
BASE_URL = os.getenv("WEBARENA_BASE_URL", "http://localhost:9999").rstrip("/")
async def verify() -> bool:
"""
Verifies that the daily routine tracking setup has been completed correctly on the forum.
"""
async with async_playwright() as p:
browser = await p.chromium.launch(headless=True)
context = await browser.new_context()
page = await context.new_page()
try:
# Step 1: Check if account can be logged in
print("Step 1: Verifying account login...", file=sys.stderr)
await page.goto(f"{BASE_URL}/", wait_until="networkidle")
# Check if already logged in
user_button = page.locator('button:has-text("RoutineTracker2025")')
if not await user_button.count():
# Try to login
print("Not logged in, attempting to login...", file=sys.stderr)
# Click login link
await page.click('a:has-text("Log in")')
await page.wait_for_load_state("networkidle")
# Fill login form
await page.fill('input[name="_username"]', "RoutineTracker2025")
await page.fill('input[name="_password"]', "DailyRoutine123!")
# Submit login form
await page.click('button:has-text("Log in")')
await page.wait_for_load_state("networkidle")
# Check if login successful
user_button = page.locator('button:has-text("RoutineTracker2025")')
if not await user_button.count():
print("Error: Account login failed for RoutineTracker2025", file=sys.stderr)
return False
print("✓ Account login successful", file=sys.stderr)
else:
print("✓ Already logged in as RoutineTracker2025", file=sys.stderr)
# Step 2: Check if the post exists in LifeProTips forum with correct content
print("Step 2: Verifying post in LifeProTips forum...", file=sys.stderr)
await page.goto(
f"{BASE_URL}/f/LifeProTips", wait_until="networkidle"
)
# Check for the created post
expected_title = "My 5-Step Morning Routine That Increased My Productivity by 200%"
post_link = page.locator(f'a:has-text("{expected_title}")')
if not await post_link.count():
print(f"Error: Post with title '{expected_title}' not found in LifeProTips forum", file=sys.stderr)
return False
# Click on the post to verify content
await post_link.click()
await page.wait_for_load_state("networkidle")
# Verify post content - this should be the content from the most upvoted comment of the calendar post
expected_content = "As a college student, having a visible reminder of the assignments I have and when they are due is super helpful for me. It also just feels good to erase them from the board once they are completed."
# Check if the content exists in the page
content_found = False
article_content = await page.locator("article").text_content()
if article_content and expected_content in article_content:
content_found = True
if not content_found:
print(f"Error: Post content does not match expected content", file=sys.stderr)
print(f"Expected: {expected_content}", file=sys.stderr)
return False
print("✓ Post found in LifeProTips with correct title and content", file=sys.stderr)
# Step 3: Check upvotes via search
print("Step 3: Verifying upvotes on posts...", file=sys.stderr)
# Check first post upvote
search_url1 = f"{BASE_URL}/search?q=LPT%3A+Use+your+calendar+as+your+to-do+list.+Assigning+dedicated+time+to+tasks+increases+the+likelyhood+of+you+acting+upon+it."
await page.goto(search_url1, wait_until="networkidle")
# Find the post and check its upvote count
posts = await page.locator("article").all()
calendar_upvoted = False
for post in posts:
title_elem = post.locator("h1 a")
if await title_elem.count():
title = await title_elem.text_content()
if "Use your calendar as your to-do list" in title:
# Check upvote count
vote_count_elem = post.locator("span.vote__net-score")
if await vote_count_elem.count():
vote_count = await vote_count_elem.text_content()
if vote_count and vote_count.strip() == "1":
calendar_upvoted = True
print("✓ Calendar post upvoted (count: 1)", file=sys.stderr)
break
if not calendar_upvoted:
print("Error: Calendar post not upvoted or upvote count is not 1", file=sys.stderr)
return False
# Check second post upvote
search_url2 = f"{BASE_URL}/search?q=LPT%3A+clean+your+stovetop+after+using+the+oven.+The+heat+loosens+grime+for+easy+removal"
await page.goto(search_url2, wait_until="networkidle")
posts = await page.locator("article").all()
stovetop_upvoted = False
for post in posts:
title_elem = post.locator("h1 a")
if await title_elem.count():
title = await title_elem.text_content()
if "clean your stovetop after using the oven" in title:
# Check upvote count
vote_count_elem = post.locator("span.vote__net-score")
if await vote_count_elem.count():
vote_count = await vote_count_elem.text_content()
if vote_count and vote_count.strip() == "1":
stovetop_upvoted = True
print("✓ Stovetop post upvoted (count: 1)", file=sys.stderr)
break
if not stovetop_upvoted:
print("Error: Stovetop post not upvoted or upvote count is not 1", file=sys.stderr)
return False
print("Success: All verification steps passed!")
return True
except PlaywrightTimeoutError as e:
print(f"Error: Timeout occurred - {str(e)}", file=sys.stderr)
return False
except Exception as e:
print(f"Error: Unexpected error - {str(e)}", file=sys.stderr)
return False
finally:
await browser.close()
def main():
"""
Executes the verification process and exits with a status code.
"""
result = asyncio.run(verify())
sys.exit(0 if result else 1)
if __name__ == "__main__":
main()
================================================
FILE: tasks/playwright_webarena/standard/shopping/advanced_product_analysis/description.md
================================================
**Task Requirements:**
1. Search for products with 'Ginger' in the Product Name field and price range $50.00 to $100.00
2. Add Q Mixers Premium Ginger Ale product to the comparison list
3. Find Intel NUC Kit product in Electronics category and add it to the comparison list
4. From the comparison page:
- Record SKU numbers for both products
- Add all products to cart
5. Record the total cart value
6. On the Ginger Ale product detail page, record:
- Number of customer reviews
- Name of the most recent reviewer (shown at the top of the first review page)
7. Output your findings in this format:
```
GingerAleSKU|sku
IntelNUCSKU|sku
CartTotal|amount
ReviewCount|count
LatestReviewer|name
```
**Example Output:**
```
GingerAleSKU|XXXXXXXXX
IntelNUCSKU|XXXXXXXXX
CartTotal|$XXX.XX
ReviewCount|XX
LatestReviewer|name
```
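A small sketch of turning such a pipe-separated answer into a dictionary for checking (the verification script below does essentially the same thing; the sample values are taken from label.txt):

```python
def parse_pipe_lines(answer_text: str) -> dict:
    """Split 'Key|Value' lines into a dict; lines without a pipe are ignored."""
    result = {}
    for line in answer_text.strip().splitlines():
        if "|" in line:
            key, value = line.split("|", 1)
            result[key.strip()] = value.strip()
    return result

print(parse_pipe_lines("GingerAleSKU|B071KC37VD\nCartTotal|$668.49"))
# {'GingerAleSKU': 'B071KC37VD', 'CartTotal': '$668.49'}
```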
================================================
FILE: tasks/playwright_webarena/standard/shopping/advanced_product_analysis/label.txt
================================================
GingerAleSKU|B071KC37VD
IntelNUCSKU|B01DJ9XID4
CartTotal|$668.49
ReviewCount|12
LatestReviewer|jwm
================================================
FILE: tasks/playwright_webarena/standard/shopping/advanced_product_analysis/meta.json
================================================
{
"task_id": "advanced_product_analysis",
"task_name": "Advanced Product Analysis",
"category_id": "shopping",
"category_name": "Shopping",
"description": "Perform comprehensive product analysis including feature comparisons, price tracking, review aggregation, customer sentiment analysis, and generate detailed recommendation reports for informed purchasing decisions.",
"author": "Yaoqi Ye",
"created_at": "2025-08-17",
"difficulty": "L3",
"tags": [
"data extraction",
"comparative analysis",
"content submission"
],
"mcp": [
"playwright"
],
"meta_data": {
"stateType": "video",
"stateContent": null,
"stateUrl": "https://storage.mcpmark.ai/tasks_state/playwright_video/one-stop-market.mp4",
"stateOriginalUrl": "https://github.com/web-arena-x/webarena/tree/main/environment_docker"
}
}
================================================
FILE: tasks/playwright_webarena/standard/shopping/advanced_product_analysis/verify.py
================================================
import asyncio
import sys
import re
import os
import json
from pathlib import Path
def get_model_response():
"""
Get the model's response from the MCP_MESSAGES environment variable.
Returns the last assistant message text.
"""
messages_path = os.getenv("MCP_MESSAGES")
print(f"MCP_MESSAGES: {messages_path}")
if not messages_path:
print("Warning: MCP_MESSAGES environment variable not set", file=sys.stderr)
return None
try:
with open(messages_path, "r") as f:
messages = json.load(f)
# Find the last assistant message
for message in reversed(messages):
if (
message.get("role") == "assistant"
and message.get("status") == "completed"
):
content = message.get("content", [])
for item in content:
if item.get("type") == "output_text":
return item.get("text", "")
print("Warning: No assistant response found in messages", file=sys.stderr)
return None
except Exception as e:
print(f"Error reading messages file: {str(e)}", file=sys.stderr)
return None
def parse_answer_format(text):
"""
Parse the <answer>...</answer> format from the agent's output.
Returns a dictionary with the parsed values.
"""
if not text:
return None
# Look for <answer>...</answer> pattern
match = re.search(r"<answer>(.*?)</answer>", text, re.IGNORECASE | re.DOTALL)
if not match:
return None
answer_content = match.group(1).strip()
# Parse each line
result = {}
lines = answer_content.split("\n")
if len(lines) != 5:
print(f"Error: Expected 5 lines in answer, got {len(lines)}", file=sys.stderr)
return None
for line in lines:
if "|" in line:
key, value = line.split("|", 1)
result[key.strip()] = value.strip()
return result
def load_expected_answer(label_path):
"""
Load the expected answer from label.txt file.
Returns a dictionary with the expected values.
"""
try:
with open(label_path, "r") as f:
lines = f.read().strip().split("\n")
expected = {}
for line in lines:
if "|" in line:
key, value = line.split("|", 1)
expected[key.strip()] = value.strip()
return expected
except Exception as e:
print(f"Error reading label file: {str(e)}", file=sys.stderr)
return None
def compare_answers(model_answer, expected_answer):
"""
Compare the model's answer with the expected answer.
Returns True if all key information matches, False otherwise.
"""
if not model_answer or not expected_answer:
return False
# Check each expected key
mismatches = []
for key, expected_value in expected_answer.items():
model_value = model_answer.get(key, "")
# Special handling for different types of values
if key == "GingerAleSKU":
# Check exact SKU match
if model_value != expected_value:
mismatches.append(
f"{key}: expected '{expected_value}', got '{model_value}'"
)
elif key == "IntelNUCSKU":
# Check exact SKU match
if model_value != expected_value:
mismatches.append(
f"{key}: expected '{expected_value}', got '{model_value}'"
)
elif key == "CartTotal":
# For price fields, only support $XX.XX format
# Check if model value has correct format
if not model_value.startswith("$"):
mismatches.append(
f"{key}: incorrect format - expected '$XX.XX' format, got '{model_value}'"
)
else:
# Normalize and compare values
expected_clean = expected_value.replace("$", "").replace(",", "")
model_clean = model_value.replace("$", "").replace(",", "")
if expected_clean != model_clean:
mismatches.append(
f"{key}: expected '{expected_value}', got '{model_value}'"
)
elif key == "ReviewCount":
# Check review count matches
if model_value != expected_value:
mismatches.append(
f"{key}: expected '{expected_value}', got '{model_value}'"
)
elif key == "LatestReviewer":
# Check reviewer name (allow partial match for names)
if expected_value.lower() not in model_value.lower() and model_value.lower() not in expected_value.lower():
mismatches.append(
f"{key}: expected '{expected_value}', got '{model_value}'"
)
else:
# Exact match for other fields
if model_value != expected_value:
mismatches.append(
f"{key}: expected '{expected_value}', got '{model_value}'"
)
if mismatches:
print("\n=== Answer Comparison Mismatches ===", file=sys.stderr)
for mismatch in mismatches:
print(f"✗ {mismatch}", file=sys.stderr)
return False
print("\n=== Answer Comparison ===", file=sys.stderr)
print("✓ All key information matches the expected answer", file=sys.stderr)
return True
async def verify() -> bool:
"""
Verifies that the advanced product analysis task has been completed correctly.
First checks the model's answer against the expected label.
"""
# Get the label file path
label_path = Path(__file__).parent / "label.txt"
# Load expected answer
expected_answer = load_expected_answer(label_path)
if not expected_answer:
print("Error: Could not load expected answer from label.txt", file=sys.stderr)
return False
# Get model's response from MCP_MESSAGES
model_response = get_model_response()
if model_response:
print("Found model response, parsing answer format...", file=sys.stderr)
model_answer = parse_answer_format(model_response)
if model_answer:
print("\n=== Model Answer Parsed ===", file=sys.stderr)
for key, value in model_answer.items():
print(f"{key}: {value}", file=sys.stderr)
# Compare answers
answer_match = compare_answers(model_answer, expected_answer)
if not answer_match:
print("\nModel answer does not match expected answer", file=sys.stderr)
return False
print("\n✓ Model answer matches expected answer", file=sys.stderr)
return True
else:
print(
"Warning: Could not parse answer format from model response",
file=sys.stderr,
)
return False
else:
print("No model response found", file=sys.stderr)
return False
def main():
"""
Executes the verification process and exits with a status code.
"""
result = asyncio.run(verify())
sys.exit(0 if result else 1)
if __name__ == "__main__":
main()
================================================
FILE: tasks/playwright_webarena/standard/shopping/gaming_accessories_analysis/description.md
================================================
**Task Requirements:**
1. In Video Games category, count products with customer rating 70% or higher in the first 2 pages
2. Sort products by price (ascending) and identify the cheapest product that has customer reviews
3. Find product with SKU 'B07D6LSCXZ' (N64 Controller), add to cart with quantity 3
4. Add products with SKU 'B071DR5V1K' and 'B082LZ4451' to comparison list, then count total products on comparison page
5. In cart, update N64 Controller quantity to 5 and record the subtotal for this item
6. Proceed to checkout and fill shipping form:
- Email: test.buyer@example.com
- First Name: Alice
- Last Name: Johnson
- Street Address: 456 Oak Avenue
- Country: United States
- State/Province: California
- City: San Francisco
- Zip Code: 94102
- Phone: 415-555-0123
Then count available shipping methods
7. Output your findings in this format:
```
Products70Plus|count
CheapestReviewedSKU|sku
CheapestReviewedPrice|price
ComparisonCount|count
N64Subtotal|amount
CheckoutEmail|test.buyer@example.com
ShippingState|California
ShippingMethods|count
```
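As a consistency check for step 5, the subtotal recorded in label.txt below ($84.95 at quantity 5) implies an N64 Controller unit price of $16.99; a tiny sketch:

```python
quantity = 5
subtotal = 84.95                        # value recorded in label.txt
unit_price = round(subtotal / quantity, 2)
print(unit_price)                       # 16.99 (implied unit price)
print(f"N64Subtotal|${unit_price * quantity:.2f}")  # N64Subtotal|$84.95
```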
================================================
FILE: tasks/playwright_webarena/standard/shopping/gaming_accessories_analysis/label.txt
================================================
Products70Plus|7
CheapestReviewedSKU|B014HDAUAA
CheapestReviewedPrice|$0.99
ComparisonCount|2
N64Subtotal|$84.95
CheckoutEmail|test.buyer@example.com
ShippingState|California
ShippingMethods|1
================================================
FILE: tasks/playwright_webarena/standard/shopping/gaming_accessories_analysis/meta.json
================================================
{
"task_id": "gaming_accessories_analysis",
"task_name": "Gaming Accessories Analysis",
"category_id": "shopping",
"category_name": "Shopping",
"description": "Research gaming peripherals by analyzing technical specifications, comparing performance metrics, evaluating user reviews, tracking price trends, and creating detailed gaming accessory recommendations.",
"author": "Yaoqi Ye",
"created_at": "2025-08-17",
"difficulty": "L3",
"tags": [
"search aggregation",
"comparative analysis",
"data extraction"
],
"mcp": [
"playwright"
],
"meta_data": {
"stateType": "video",
"stateContent": null,
"stateUrl": "https://storage.mcpmark.ai/tasks_state/playwright_video/one-stop-market.mp4",
"stateOriginalUrl": "https://github.com/web-arena-x/webarena/tree/main/environment_docker"
}
}
================================================
FILE: tasks/playwright_webarena/standard/shopping/gaming_accessories_analysis/verify.py
================================================
import asyncio
import sys
import re
import os
import json
from pathlib import Path
def get_model_response():
"""
Get the model's response from the MCP_MESSAGES environment variable.
Returns the last assistant message text.
"""
messages_path = os.getenv("MCP_MESSAGES")
print(f"MCP_MESSAGES: {messages_path}")
if not messages_path:
print("Warning: MCP_MESSAGES environment variable not set", file=sys.stderr)
return None
try:
with open(messages_path, "r") as f:
messages = json.load(f)
# Find the last assistant message
for message in reversed(messages):
if (
message.get("role") == "assistant"
and message.get("status") == "completed"
and message.get("type") == "message"
):
content = message.get("content", [])
for item in content:
if item.get("type") == "output_text":
return item.get("text", "")
print("Warning: No assistant response found in messages", file=sys.stderr)
return None
except Exception as e:
print(f"Error reading messages file: {str(e)}", file=sys.stderr)
return None
def parse_answer_format(text):
"""
Parse the <answer>...</answer> format from the agent's output.
Returns a dictionary with the parsed values.
"""
if not text:
return None
# Look for <answer>...</answer> pattern
match = re.search(r"<answer>(.*?)</answer>", text, re.IGNORECASE | re.DOTALL)
if not match:
return None
answer_content = match.group(1).strip()
# Parse each line
result = {}
lines = answer_content.split("\n")
if len(lines) != 8:
print(f"Error: Expected 8 lines in answer, got {len(lines)}", file=sys.stderr)
return None
for line in lines:
if "|" in line:
key, value = line.split("|", 1)
result[key.strip()] = value.strip()
return result
def load_expected_answer(label_path):
"""
Load the expected answer from label.txt file.
Returns a dictionary with the expected values.
"""
try:
with open(label_path, "r") as f:
lines = f.read().strip().split("\n")
expected = {}
for line in lines:
if "|" in line:
key, value = line.split("|", 1)
expected[key.strip()] = value.strip()
return expected
except Exception as e:
print(f"Error reading label file: {str(e)}", file=sys.stderr)
return None
def compare_answers(model_answer, expected_answer):
"""
Compare the model's answer with the expected answer.
Returns True if all key information matches, False otherwise.
"""
if not model_answer or not expected_answer:
return False
# Check each expected key
mismatches = []
for key, expected_value in expected_answer.items():
model_value = model_answer.get(key, "")
# Special handling for different types of values
if key in ["CheapestReviewedPrice", "N64Subtotal"]:
# For price fields, only support $XX.XX format
# Check if model value has correct format
if not model_value.startswith("$"):
mismatches.append(
f"{key}: incorrect format - expected '$XX.XX' format, got '{model_value}'"
)
else:
# Normalize and compare values
expected_clean = expected_value.replace("$", "").replace(",", "")
model_clean = model_value.replace("$", "").replace(",", "")
if expected_clean != model_clean:
mismatches.append(
f"{key}: expected '{expected_value}', got '{model_value}'"
)
elif key == "CheckoutEmail":
# Email should match exactly (case-insensitive)
if model_value.lower() != expected_value.lower():
mismatches.append(
f"{key}: expected '{expected_value}', got '{model_value}'"
)
elif key == "Products70Plus":
# For count fields, allow some flexibility (products might change)
# But still check if it's a reasonable number
try:
model_count = int(model_value)
expected_count = int(expected_value)
# Allow up to 2 products difference (in case of dynamic content)
if abs(model_count - expected_count) > 2:
mismatches.append(
f"{key}: expected around '{expected_value}', got '{model_value}'"
)
except ValueError:
mismatches.append(
f"{key}: expected '{expected_value}', got '{model_value}'"
)
else:
# Exact match for other fields
if model_value != expected_value:
mismatches.append(
f"{key}: expected '{expected_value}', got '{model_value}'"
)
if mismatches:
print("\n=== Answer Comparison Mismatches ===", file=sys.stderr)
for mismatch in mismatches:
print(f"✗ {mismatch}", file=sys.stderr)
return False
print("\n=== Answer Comparison ===", file=sys.stderr)
print("✓ All key information matches the expected answer", file=sys.stderr)
return True
async def verify() -> bool:
"""
Verifies that the gaming accessories analysis task has been completed correctly.
Checks the model's answer against the expected label.
"""
# Get the label file path
label_path = Path(__file__).parent / "label.txt"
# Load expected answer
expected_answer = load_expected_answer(label_path)
if not expected_answer:
print("Error: Could not load expected answer from label.txt", file=sys.stderr)
return False
# Get model's response from MCP_MESSAGES
model_response = get_model_response()
if model_response:
print("Found model response, parsing answer format...", file=sys.stderr)
model_answer = parse_answer_format(model_response)
if model_answer:
print("\n=== Model Answer Parsed ===", file=sys.stderr)
for key, value in model_answer.items():
print(f"{key}: {value}", file=sys.stderr)
# Compare answers
answer_match = compare_answers(model_answer, expected_answer)
if not answer_match:
print("\nModel answer does not match expected answer", file=sys.stderr)
return False
print("\n✓ Model answer matches expected answer", file=sys.stderr)
return True
else:
print(
"Warning: Could not parse answer format from model response",
file=sys.stderr,
)
return False
else:
print("No model response found", file=sys.stderr)
return False
def main():
"""
Executes the verification process and exits with a status code.
"""
result = asyncio.run(verify())
sys.exit(0 if result else 1)
if __name__ == "__main__":
main()
================================================
FILE: tasks/playwright_webarena/standard/shopping/health_routine_optimization/description.md
================================================
## Task Requirements
1. Search for products with `vitamin` in Description and price range `$0.00` to `$99.99`. Record total search results count.
2. In "Health & Household" category with price filter `$0.00 - $99.99`:
- Add "LOOPACELL AG13 LR44 L1154 357 76A A76 Button Cell Battery 10 Pack" to comparison
- Add "Energizer MAX C Batteries, Premium Alkaline C Cell Batteries (8 Battery Count)" to comparison
- Record each battery's price
- Verify comparison list has 2 items
3. Search `Elmwood Inn Fine Teas`, find "Elmwood Inn Fine Teas, Orange Vanilla Caffeine-free Fruit Infusion, 16-Ounce Pouch":
- Record SKU, rating percentage, and review count
- Add to cart with quantity 2
4. Search `energy`, sort by Relevance (descending):
- Find "V8 +Energy, Healthy Energy Drink, Steady Energy from Black and Green Tea, Pomegranate Blueberry, 8 Ounce Can ,Pack of 24"
- Record its position (1st, 2nd, 3rd, etc.)
- Add to cart with quantity 1
5. In cart:
- Record unique products count, total quantity, and subtotal
- Then update Elmwood tea quantity to 3
- Record new subtotal
## Output Format
```
AdvancedSearchResults|XXXX
Battery1Name|LOOPACELL AG13 LR44
Battery1Price|$X.XX
Battery2Name|Energizer MAX C
Battery2Price|$XX.XX
ComparisonCount|X
TeaSKU|XXXXXXXXXX
TeaRating|XXX%
TeaReviews|X
V8Position|Xth
CartUniqueProducts|X
CartTotalQuantity|X
InitialSubtotal|$XX.XX
FinalSubtotal|$XX.XX
```
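A quick consistency check on step 5, using the subtotals from label.txt below: raising the tea quantity from 2 to 3 adds one tea, so the difference between the two subtotals implies the tea's unit price, and the V8 price follows from the initial subtotal:

```python
initial_subtotal = 53.19   # tea x2 + V8 x1 (from label.txt)
final_subtotal = 72.55     # tea x3 + V8 x1 (from label.txt)
tea_price = round(final_subtotal - initial_subtotal, 2)   # 19.36 (implied)
v8_price = round(initial_subtotal - 2 * tea_price, 2)     # 14.47 (implied)
print(tea_price, v8_price)
print(round(3 * tea_price + v8_price, 2))                  # 72.55, matching FinalSubtotal
```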
================================================
FILE: tasks/playwright_webarena/standard/shopping/health_routine_optimization/label.txt
================================================
AdvancedSearchResults|2906
Battery1Name|LOOPACELL AG13 LR44
Battery1Price|$3.72
Battery2Name|Energizer MAX C
Battery2Price|$14.87
ComparisonCount|2
TeaSKU|B0040WHKIY
TeaRating|95%
TeaReviews|4
V8Position|3rd
CartUniqueProducts|2
CartTotalQuantity|3
InitialSubtotal|$53.19
FinalSubtotal|$72.55
================================================
FILE: tasks/playwright_webarena/standard/shopping/health_routine_optimization/meta.json
================================================
{
"task_id": "health_routine_optimization",
"task_name": "Health Routine Optimization",
"category_id": "shopping",
"category_name": "Shopping",
"description": "Optimize health and wellness product selections by analyzing nutritional supplements, fitness equipment, creating personalized routines, and tracking health metrics for lifestyle improvements.",
"author": "Yaoqi Ye",
"created_at": "2025-08-17",
"difficulty": "L3",
"tags": [
"data extraction",
"comparative analysis",
"content submission"
],
"mcp": [
"playwright"
],
"meta_data": {
"stateType": "video",
"stateContent": null,
"stateUrl": "https://storage.mcpmark.ai/tasks_state/playwright_video/one-stop-market.mp4",
"stateOriginalUrl": "https://github.com/web-arena-x/webarena/tree/main/environment_docker"
}
}
================================================
FILE: tasks/playwright_webarena/standard/shopping/health_routine_optimization/verify.py
================================================
import asyncio
import sys
import os
import json
import re
from pathlib import Path
def get_model_response():
"""
Get the model's response from the MCP_MESSAGES environment variable.
Returns the last assistant message text.
"""
messages_path = os.getenv("MCP_MESSAGES")
print(f"MCP_MESSAGES: {messages_path}")
if not messages_path:
print("Warning: MCP_MESSAGES environment variable not set", file=sys.stderr)
return None
try:
with open(messages_path, "r") as f:
messages = json.load(f)
# Find the last assistant message
for message in reversed(messages):
if (
message.get("role") == "assistant"
and message.get("status") == "completed"
and message.get("type") == "message"
):
content = message.get("content", [])
for item in content:
if item.get("type") == "output_text":
return item.get("text", "")
print("Warning: No assistant response found in messages", file=sys.stderr)
return None
except Exception as e:
print(f"Error reading messages file: {str(e)}", file=sys.stderr)
return None
def parse_answer_format(text):
"""
Parse the <answer>...</answer> format from the agent's output.
Returns a dictionary with the parsed values.
"""
if not text:
return None
# Look for <answer>...</answer> pattern
match = re.search(r"<answer>(.*?)</answer>", text, re.IGNORECASE | re.DOTALL)
if not match:
return None
answer_content = match.group(1).strip()
# Parse each line
result = {}
lines = answer_content.split("\n")
if len(lines) != 14:
print(f"Error: Expected 14 lines in answer, got {len(lines)}", file=sys.stderr)
return None
for line in lines:
if "|" in line:
key, value = line.split("|", 1)
result[key.strip()] = value.strip()
return result
def load_expected_answer(label_path):
"""
Load the expected answer from label.txt file.
Returns a dictionary with the expected values.
"""
try:
with open(label_path, "r") as f:
content = f.read().strip()
# Parse the answer from the label file
# The label file may contain <answer>...</answer> tags
match = re.search(r"<answer>(.*?)</answer>", content, re.IGNORECASE | re.DOTALL)
if match:
answer_content = match.group(1).strip()
lines = answer_content.split("\n")
else:
# Fallback: treat the whole file as answer content
lines = content.split("\n")
expected = {}
for line in lines:
if "|" in line:
key, value = line.split("|", 1)
expected[key.strip()] = value.strip()
return expected
except Exception as e:
print(f"Error reading label file: {str(e)}", file=sys.stderr)
return None
def compare_answers(model_answer, expected_answer):
"""
Compare the model's answer with the expected answer.
Returns True if all key information matches, False otherwise.
"""
if not model_answer or not expected_answer:
return False
# Check each expected key
mismatches = []
for key, expected_value in expected_answer.items():
model_value = model_answer.get(key, "")
# Special handling for different types of values
if key in ["Battery1Price", "Battery2Price", "InitialSubtotal", "FinalSubtotal"]:
# For price fields, only support $XX.XX format
# Check if model value has correct format
if not model_value.startswith("$"):
mismatches.append(
f"{key}: incorrect format - expected '$XX.XX' format, got '{model_value}'"
)
else:
# Normalize and compare values
expected_clean = expected_value.replace("$", "").replace(",", "")
model_clean = model_value.replace("$", "").replace(",", "")
if expected_clean != model_clean:
mismatches.append(
f"{key}: expected '{expected_value}', got '{model_value}'"
)
else:
# Exact match for other fields
if model_value != expected_value:
mismatches.append(
f"{key}: expected '{expected_value}', got '{model_value}'"
)
if mismatches:
print("\n=== Answer Comparison Mismatches ===", file=sys.stderr)
for mismatch in mismatches:
print(f"✗ {mismatch}", file=sys.stderr)
return False
print("\n=== Answer Comparison ===", file=sys.stderr)
print("✓ All key information matches the expected answer", file=sys.stderr)
return True
async def verify() -> bool:
"""
Verifies that the health routine optimization task has been completed correctly.
Checks the model's answer against the expected label.
"""
# Get the label file path
label_path = Path(__file__).parent / "label.txt"
# Load expected answer
expected_answer = load_expected_answer(label_path)
if not expected_answer:
print("Error: Could not load expected answer from label.txt", file=sys.stderr)
return False
# Get model's response from MCP_MESSAGES
model_response = get_model_response()
if model_response:
print("Found model response, parsing answer format...", file=sys.stderr)
model_answer = parse_answer_format(model_response)
if model_answer:
print("\n=== Model Answer Parsed ===", file=sys.stderr)
for key, value in model_answer.items():
print(f"{key}: {value}", file=sys.stderr)
# Compare answers
answer_match = compare_answers(model_answer, expected_answer)
if not answer_match:
print("\nModel answer does not match expected answer", file=sys.stderr)
return False
print("\n✓ Model answer matches expected answer", file=sys.stderr)
return True
else:
print(
"Warning: Could not parse answer format from model response",
file=sys.stderr,
)
return False
else:
print("No model response found", file=sys.stderr)
return False
def main():
"""
Executes the verification process and exits with a status code.
"""
result = asyncio.run(verify())
sys.exit(0 if result else 1)
if __name__ == "__main__":
main()
================================================
FILE: tasks/playwright_webarena/standard/shopping/holiday_baking_competition/description.md
================================================
**Task Requirements:**
1. Search 'gingerbread', sort by price (high to low):
- Add most expensive product to comparison list
- Record SKU of second most expensive product
2. Search 'cookie' with price range $20.00-$40.00:
- Find product with highest rating % and at least 5 reviews in the first 2 pages (if tied, choose lowest price)
- Record SKU and rating %
- Select "Cookies: Oatmeal Chocolate Chunk" flavor if required
- Add to cart with quantity 2
3. Search 'chocolate', sort by price (low to high):
- Find cheapest product with at least 1 review
- Record price and review count
- Select "Peanut Butter Flavor" if required
- Add to cart with quantity 3
4. In cart:
- Update cookie quantity from 2 to 5
- Record cart subtotal and total items count
5. Search 'gingerbread', go to page 2:
- Find third product on page 2
- Record SKU, price, and manufacturer
**Output Format:**
```
SecondGingerbreadSKU|sku
HighestRatedCookieSKURating|sku:rating%
CheapestChocolatePriceReviews|$price:reviews
CartSubtotalAfterUpdate|$amount
TotalCartItems|count
Page2ThirdProductSKUPrice|sku:$price
ProductManufacturer|manufacturer
```
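The composite output fields join two values with a colon; for example, with the values recorded in label.txt below:

```python
sku, rating = "B0951CPYV7", "86%"
print(f"HighestRatedCookieSKURating|{sku}:{rating}")
# HighestRatedCookieSKURating|B0951CPYV7:86%

price, reviews = "$1.04", 12
print(f"CheapestChocolatePriceReviews|{price}:{reviews}")
# CheapestChocolatePriceReviews|$1.04:12
```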
================================================
FILE: tasks/playwright_webarena/standard/shopping/holiday_baking_competition/label.txt
================================================
SecondGingerbreadSKU|B0075AO9RI
HighestRatedCookieSKURating|B0951CPYV7:86%
CheapestChocolatePriceReviews|$1.04:12
CartSubtotalAfterUpdate|$128.07
TotalCartItems|8
Page2ThirdProductSKUPrice|B09RPXCB47:$21.99
ProductManufacturer|That Melanin Tho
================================================
FILE: tasks/playwright_webarena/standard/shopping/holiday_baking_competition/meta.json
================================================
{
"task_id": "holiday_baking_competition",
"task_name": "Holiday Baking Competition",
"category_id": "shopping",
"category_name": "Shopping",
"description": "Research baking supplies for competition preparation including ingredient quality analysis, equipment comparisons, recipe optimization, and creating comprehensive shopping list with budget recommendations.",
"author": "Yaoqi Ye",
"created_at": "2025-08-17",
"difficulty": "L3",
"tags": [
"search aggregation",
"comparative analysis",
"inventory management"
],
"mcp": [
"playwright"
],
"meta_data": {
"stateType": "video",
"stateContent": null,
"stateUrl": "https://storage.mcpmark.ai/tasks_state/playwright_video/one-stop-market.mp4",
"stateOriginalUrl": "https://github.com/web-arena-x/webarena/tree/main/environment_docker"
}
}
================================================
FILE: tasks/playwright_webarena/standard/shopping/holiday_baking_competition/verify.py
================================================
import asyncio
import sys
import re
import os
import json
from pathlib import Path
def get_model_response():
"""
Get the model's response from the MCP_MESSAGES environment variable.
Returns the last assistant message text.
"""
messages_path = os.getenv("MCP_MESSAGES")
print(f"MCP_MESSAGES: {messages_path}")
if not messages_path:
print("Warning: MCP_MESSAGES environment variable not set", file=sys.stderr)
return None
try:
with open(messages_path, "r") as f:
messages = json.load(f)
# Find the last assistant message
for message in reversed(messages):
if (
message.get("role") == "assistant"
and message.get("status") == "completed"
and message.get("type") == "message"
):
content = message.get("content", [])
for item in content:
if item.get("type") == "output_text":
return item.get("text", "")
print("Warning: No assistant response found in messages", file=sys.stderr)
return None
except Exception as e:
print(f"Error reading messages file: {str(e)}", file=sys.stderr)
return None
def parse_answer_format(text):
"""
Parse the <answer>...</answer> format from the agent's output.
Returns a dictionary with the parsed values.
"""
if not text:
return None
# Look for <answer>...</answer> pattern
match = re.search(r"<answer>(.*?)</answer>", text, re.IGNORECASE | re.DOTALL)
if not match:
return None
answer_content = match.group(1).strip()
# Parse each line
result = {}
lines = answer_content.split("\n")
if len(lines) != 7:
print(f"Error: Expected 7 lines in answer, got {len(lines)}", file=sys.stderr)
return None
for line in lines:
if "|" in line:
key, value = line.split("|", 1)
result[key.strip()] = value.strip()
return result
def load_expected_answer(label_path):
"""
Load the expected answer from label.txt file.
Returns a dictionary with the expected values.
"""
try:
with open(label_path, "r") as f:
lines = f.read().strip().split("\n")
expected = {}
for line in lines:
if "|" in line:
key, value = line.split("|", 1)
expected[key.strip()] = value.strip()
return expected
except Exception as e:
print(f"Error reading label file: {str(e)}", file=sys.stderr)
return None
def compare_answers(model_answer, expected_answer):
"""
Compare the model's answer with the expected answer.
Returns True if all key information matches, False otherwise.
"""
if not model_answer or not expected_answer:
return False
# Check each expected key
mismatches = []
for key, expected_value in expected_answer.items():
model_value = model_answer.get(key, "")
# Special handling for different types of values
if key == "SecondGingerbreadSKU":
# SKU should match exactly (case-insensitive)
if model_value.upper() != expected_value.upper():
mismatches.append(
f"{key}: expected '{expected_value}', got '{model_value}'"
)
elif key in ["CartSubtotalAfterUpdate"]:
# For price fields, only support $XX.XX format
# Check if model value has correct format
if not model_value.startswith("$"):
mismatches.append(
f"{key}: incorrect format - expected '$XX.XX' format, got '{model_value}'"
)
else:
# Normalize and compare values
expected_clean = expected_value.replace("$", "").replace(",", "")
model_clean = model_value.replace("$", "").replace(",", "")
# Allow some tolerance for price calculations (within $0.01)
try:
expected_float = float(expected_clean)
model_float = float(model_clean)
if abs(expected_float - model_float) > 0.01:
mismatches.append(
f"{key}: expected '{expected_value}', got '{model_value}'"
)
except ValueError:
if expected_value != model_value:
mismatches.append(
f"{key}: expected '{expected_value}', got '{model_value}'"
)
elif key in ["TotalCartItems"]:
# Should be a number
if model_value != expected_value:
mismatches.append(
f"{key}: expected '{expected_value}', got '{model_value}'"
)
elif key in ["HighestRatedCookieSKURating", "CheapestChocolatePriceReviews", "Page2ThirdProductSKUPrice"]:
# Colon-separated fields (sku:rating, price:reviews, sku:price)
if ":" in expected_value and ":" in model_value:
expected_parts = expected_value.split(":", 1)
model_parts = model_value.split(":", 1)
if len(expected_parts) == 2 and len(model_parts) == 2:
# For price fields, normalize the price part
if key == "CheapestChocolatePriceReviews":
# Check if price part has correct format ($XX.XX)
if not model_parts[0].startswith("$"):
mismatches.append(
f"{key}: incorrect format - price part should start with '$', got '{model_value}'"
)
else:
expected_price = expected_parts[0].replace("$", "").replace(",", "")
model_price = model_parts[0].replace("$", "").replace(",", "")
try:
if abs(float(expected_price) - float(model_price)) > 0.01 or expected_parts[1] != model_parts[1]:
mismatches.append(
f"{key}: expected '{expected_value}', got '{model_value}'"
)
except ValueError:
if expected_value != model_value:
mismatches.append(
f"{key}: expected '{expected_value}', got '{model_value}'"
)
elif key == "Page2ThirdProductSKUPrice":
# Check if price part has correct format ($XX.XX)
if not model_parts[1].startswith("$"):
mismatches.append(
f"{key}: incorrect format - price part should start with '$', got '{model_value}'"
)
else:
expected_price = expected_parts[1].replace("$", "").replace(",", "")
model_price = model_parts[1].replace("$", "").replace(",", "")
try:
if expected_parts[0] != model_parts[0] or abs(float(expected_price) - float(model_price)) > 0.01:
mismatches.append(
f"{key}: expected '{expected_value}', got '{model_value}'"
)
except ValueError:
if expected_value != model_value:
mismatches.append(
f"{key}: expected '{expected_value}', got '{model_value}'"
)
else:
# For rating fields, exact match
if expected_value != model_value:
mismatches.append(
f"{key}: expected '{expected_value}', got '{model_value}'"
)
else:
mismatches.append(
f"{key}: expected '{expected_value}', got '{model_value}'"
)
else:
if expected_value != model_value:
mismatches.append(
f"{key}: expected '{expected_value}', got '{model_value}'"
)
else:
# Exact match for other fields (like ProductManufacturer)
if model_value != expected_value:
mismatches.append(
f"{key}: expected '{expected_value}', got '{model_value}'"
)
if mismatches:
print("\n=== Answer Comparison Mismatches ===", file=sys.stderr)
for mismatch in mismatches:
print(f"✗ {mismatch}", file=sys.stderr)
return False
print("\n=== Answer Comparison ===", file=sys.stderr)
print("✓ All key information matches the expected answer", file=sys.stderr)
return True
async def verify() -> bool:
"""
Verifies that the holiday baking competition task has been completed correctly.
Checks the model's answer against the expected label.
"""
# Get the label file path
label_path = Path(__file__).parent / "label.txt"
# Load expected answer
expected_answer = load_expected_answer(label_path)
if not expected_answer:
print("Error: Could not load expected answer from label.txt", file=sys.stderr)
return False
# Get model's response from MCP_MESSAGES
model_response = get_model_response()
if model_response:
print("Found model response, parsing answer format...", file=sys.stderr)
model_answer = parse_answer_format(model_response)
if model_answer:
print("\n=== Model Answer Parsed ===", file=sys.stderr)
for key, value in model_answer.items():
print(f"{key}: {value}", file=sys.stderr)
# Compare answers
answer_match = compare_answers(model_answer, expected_answer)
if not answer_match:
print("\nModel answer does not match expected answer", file=sys.stderr)
return False
print("\n✓ Model answer matches expected answer", file=sys.stderr)
return True
else:
print(
"Warning: Could not parse answer format from model response",
file=sys.stderr,
)
return False
else:
print("No model response found", file=sys.stderr)
return False
def main():
"""
Executes the verification process and exits with a status code.
"""
result = asyncio.run(verify())
sys.exit(0 if result else 1)
if __name__ == "__main__":
main()
================================================
FILE: tasks/playwright_webarena/standard/shopping/multi_category_budget_analysis/description.md
================================================
Complete the following tasks on One Stop Market (http://localhost:7770):
**Task Requirements:**
1. In Chocolate subcategory, sort by price (ascending):
- Record price and SKU of first 3 products
2. Search for 'tabletop' with price range $100.00-$200.00:
- Find the cheapest tabletop that has the highest review rating with at least 3 reviews.
- Record search results count
- Record price of required tabletop
3. In "Computers & Accessories" subcategory with price filter $0.00-$9,999.99:
- Sort by price (ascending)
- Record price of cheapest item
4. Add these products to comparison:
- "Little Secrets Chocolate Pieces, Peanut Butter Flavor"
- "Multi Accessory Hub Adapter By JOBY"
- "SanDisk Cruzer Glide 32GB (5 Pack) USB 2.0 Flash Drive"
- Count total items on comparison page
5. In cart:
- Add the cheapest chocolate product (from step 1) with "Peanut flavor" if available
- Add cheapest computer accessory (from step 3)
- Record cart subtotal and item count
6. Calculate:
- Sum of 3 chocolate product prices
- Price difference: cheapest tabletop minus cheapest computer accessory
- Whether sum of 3 comparison items < $60
**Output Format:**
```
chocolate_products|Price1:SKU1;Price2:SKU2;Price3:SKU3
chocolate_sum|Total
tabletop_search_count|Count
tabletop_product|Price:SKU
tabletop_reviews|NumbersOfReviews:Rating
cheapest_computer_accessory|Price
price_difference|Amount
comparison_count|Count
cart_subtotal|Amount
cart_item_count|Count
under_60_budget|YES/NO
```
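For illustration, a minimal sketch (Python, placeholder prices only) of how the step 6 calculations map onto the last output lines; the real numbers come from the live store pages in steps 1-4:
```
# Placeholder prices only; gather the real values from the store in steps 1-4.
chocolate_prices = [1.00, 2.00, 3.00]        # step 1: three cheapest chocolates
cheapest_tabletop = 150.00                   # step 2
cheapest_accessory = 1.50                    # step 3
comparison_prices = [5.00, 30.00, 20.00]     # step 4: the three comparison items

chocolate_sum = sum(chocolate_prices)                       # chocolate_sum
price_difference = cheapest_tabletop - cheapest_accessory   # price_difference
under_60 = "YES" if sum(comparison_prices) < 60 else "NO"   # under_60_budget

print(f"chocolate_sum|${chocolate_sum:.2f}")
print(f"price_difference|${price_difference:.2f}")
print(f"under_60_budget|{under_60}")
```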
================================================
FILE: tasks/playwright_webarena/standard/shopping/multi_category_budget_analysis/label.txt
================================================
chocolate_products|$1.04:B071954ZDC;$1.89:B07ND598N3;$2.50:B01G26DMSC
chocolate_sum|$5.43
tabletop_search_count|60
tabletop_product|$169.99:B09NPX5CDP
tabletop_reviews|4:95%
cheapest_computer_accessory|$1.17
price_difference|$168.82
comparison_count|3
cart_subtotal|$2.21
cart_item_count|2
under_60_budget|YES
================================================
FILE: tasks/playwright_webarena/standard/shopping/multi_category_budget_analysis/meta.json
================================================
{
"task_id": "multi_category_budget_analysis",
"task_name": "Multi Category Budget Analysis",
"category_id": "shopping",
"category_name": "Shopping",
"description": "Analyze spending patterns across multiple product categories, optimize budget allocation, identify cost-saving opportunities, and generate comprehensive financial planning report with purchase recommendations.",
"author": "Yaoqi Ye",
"created_at": "2025-08-17",
"difficulty": "L3",
"tags": [
"data extraction",
"search aggregation",
"content submission",
"comparative analysis",
"inventory management"
],
"mcp": [
"playwright"
],
"meta_data": {
"stateType": "video",
"stateContent": null,
"stateUrl": "https://storage.mcpmark.ai/tasks_state/playwright_video/one-stop-market.mp4",
"stateOriginalUrl": "https://github.com/web-arena-x/webarena/tree/main/environment_docker"
}
}
================================================
FILE: tasks/playwright_webarena/standard/shopping/multi_category_budget_analysis/verify.py
================================================
import asyncio
import sys
import re
import os
import json
from pathlib import Path
def get_model_response():
"""
Get the model's response from the MCP_MESSAGES environment variable.
Returns the last assistant message text.
"""
messages_path = os.getenv("MCP_MESSAGES")
print(f"MCP_MESSAGES: {messages_path}")
if not messages_path:
print("Warning: MCP_MESSAGES environment variable not set", file=sys.stderr)
return None
try:
with open(messages_path, "r") as f:
messages = json.load(f)
# Find the last assistant message
for message in reversed(messages):
if (
message.get("role") == "assistant"
and message.get("status") == "completed"
and message.get("type") == "message"
):
content = message.get("content", [])
for item in content:
if item.get("type") == "output_text":
return item.get("text", "")
print("Warning: No assistant response found in messages", file=sys.stderr)
return None
except Exception as e:
print(f"Error reading messages file: {str(e)}", file=sys.stderr)
return None
def parse_answer_format(text):
"""
    Parse the <answer>...</answer> format from the agent's output.
Returns a dictionary with the parsed values.
"""
if not text:
return None
    # Look for <answer>...</answer> pattern
    match = re.search(r"<answer>(.*?)</answer>", text, re.IGNORECASE | re.DOTALL)
if not match:
return None
answer_content = match.group(1).strip()
# Parse each line
result = {}
lines = answer_content.split("\n")
if len(lines) != 11:
print(f"Error: Expected 11 lines in answer, got {len(lines)}", file=sys.stderr)
return None
for line in lines:
if "|" in line:
key, value = line.split("|", 1)
result[key.strip()] = value.strip()
return result
def load_expected_answer(label_path):
"""
Load the expected answer from label.txt file.
Returns a dictionary with the expected values.
"""
try:
with open(label_path, "r") as f:
lines = f.read().strip().split("\n")
expected = {}
for line in lines:
if "|" in line:
key, value = line.split("|", 1)
expected[key.strip()] = value.strip()
return expected
except Exception as e:
print(f"Error reading label file: {str(e)}", file=sys.stderr)
return None
def compare_answers(model_answer, expected_answer):
"""
Compare the model's answer with the expected answer.
Returns True if all key information matches, False otherwise.
"""
if not model_answer or not expected_answer:
return False
# Check each expected key
mismatches = []
for key, expected_value in expected_answer.items():
model_value = model_answer.get(key, "")
# Special handling for different types of values
if key == "chocolate_products":
# Parse and compare chocolate products with price:SKU format
expected_products = expected_value.split(";")
model_products = model_value.split(";")
if len(expected_products) != len(model_products):
mismatches.append(f"{key}: expected {len(expected_products)} products, got {len(model_products)}")
else:
for i, (exp, mod) in enumerate(zip(expected_products, model_products)):
exp_parts = exp.strip().split(":")
mod_parts = mod.strip().split(":")
if len(exp_parts) != 2 or len(mod_parts) != 2:
mismatches.append(f"{key}: product {i+1} format error - expected 'price:SKU'")
else:
# Check price format (should start with $)
if not mod_parts[0].startswith("$"):
mismatches.append(f"{key}: product {i+1} price format error - expected '$XX.XX' format, got '{mod_parts[0]}'")
elif exp_parts[0] != mod_parts[0] or exp_parts[1] != mod_parts[1]:
mismatches.append(f"{key}: product {i+1} mismatch - expected '{exp}', got '{mod}'")
elif key == "tabletop_product":
# Parse and compare tabletop product with price:SKU format
exp_parts = expected_value.strip().split(":")
mod_parts = model_value.strip().split(":")
if len(exp_parts) != 2 or len(mod_parts) != 2:
mismatches.append(f"{key}: format error - expected 'price:SKU', got '{model_value}'")
else:
# Check price format (should start with $)
if not mod_parts[0].startswith("$"):
mismatches.append(f"{key}: price format error - expected '$XX.XX' format, got '{mod_parts[0]}'")
elif exp_parts[0] != mod_parts[0] or exp_parts[1] != mod_parts[1]:
mismatches.append(f"{key}: mismatch - expected '{expected_value}', got '{model_value}'")
elif key == "tabletop_reviews":
# Parse and compare tabletop reviews with NumberOfReviews:Rating format
exp_parts = expected_value.strip().split(":")
mod_parts = model_value.strip().split(":")
if len(exp_parts) != 2 or len(mod_parts) != 2:
mismatches.append(f"{key}: format error - expected 'NumberOfReviews:Rating', got '{model_value}'")
else:
# Check if both parts match
if exp_parts[0] != mod_parts[0] or exp_parts[1] != mod_parts[1]:
mismatches.append(f"{key}: mismatch - expected '{expected_value}', got '{model_value}'")
elif key in ["chocolate_sum", "price_difference", "cart_subtotal", "cheapest_computer_accessory"]:
# For price fields, only support $XX.XX format
# Check if model value has correct format
if not model_value.startswith("$"):
mismatches.append(
f"{key}: incorrect format - expected '$XX.XX' format, got '{model_value}'"
)
else:
# Normalize and compare values
expected_clean = expected_value.replace("$", "").replace(",", "")
model_clean = model_value.replace("$", "").replace(",", "")
if expected_clean != model_clean:
mismatches.append(
f"{key}: expected '{expected_value}', got '{model_value}'"
)
elif key == "under_60_budget":
# Compare YES/NO value (case-insensitive)
if expected_value.upper() != model_value.upper():
mismatches.append(f"{key}: expected '{expected_value}', got '{model_value}'")
elif key in ["tabletop_search_count", "comparison_count", "cart_item_count"]:
# Numeric fields - exact match
if model_value != expected_value:
mismatches.append(
f"{key}: expected '{expected_value}', got '{model_value}'"
)
else:
# Exact match for other fields
if model_value != expected_value:
mismatches.append(
f"{key}: expected '{expected_value}', got '{model_value}'"
)
if mismatches:
print("\n=== Answer Comparison Mismatches ===", file=sys.stderr)
for mismatch in mismatches:
print(f"✗ {mismatch}", file=sys.stderr)
return False
print("\n=== Answer Comparison ===", file=sys.stderr)
print("✓ All key information matches the expected answer", file=sys.stderr)
return True
async def verify() -> bool:
"""
Verifies that the multi-category budget analysis task has been completed correctly.
"""
# Get the label file path
label_path = Path(__file__).parent / "label.txt"
# Load expected answer
expected_answer = load_expected_answer(label_path)
if not expected_answer:
print("Error: Could not load expected answer from label.txt", file=sys.stderr)
return False
# Get model's response from MCP_MESSAGES
model_response = get_model_response()
if model_response:
print("Found model response, parsing answer format...", file=sys.stderr)
model_answer = parse_answer_format(model_response)
if model_answer:
print("\n=== Model Answer Parsed ===", file=sys.stderr)
for key, value in model_answer.items():
print(f"{key}: {value}", file=sys.stderr)
# Compare answers
answer_match = compare_answers(model_answer, expected_answer)
if not answer_match:
print("\nModel answer does not match expected answer", file=sys.stderr)
return False
print("\n✓ Model answer matches expected answer", file=sys.stderr)
return True
else:
print("Warning: Could not parse answer format from model response", file=sys.stderr)
return False
else:
print("No model response found", file=sys.stderr)
return False
def main():
"""
Executes the verification process and exits with a status code.
"""
result = asyncio.run(verify())
sys.exit(0 if result else 1)
if __name__ == "__main__":
main()
================================================
FILE: tasks/playwright_webarena/standard/shopping/printer_keyboard_search/description.md
================================================
1. Search for a `printer capable of reducing blue light` that:
   - Is pink or purple (must be stated in the product details, not inferred from the image)
   - Is manufactured in Asia
   Record its SKU ID and price
2. Find a keyboard with:
   - Bluetooth mode (must be stated in either the details or the title)
- Price between $50.00-$100.00
- Highest review rating among matching products
Record SKU ID, price, number of reviews, and review rating
**Output Format:**
```
PrinterSKUID|id
PrinterPrice|$XX.XX
KeyboardSKUID|id
KeyboardPrice|$XX.XX
KeyboardReviews|XX
KeyboardRating|XX%
```
================================================
FILE: tasks/playwright_webarena/standard/shopping/printer_keyboard_search/label.txt
================================================
PrinterSKUID|B09J8KQX6V
PrinterPrice|$248.04
KeyboardSKUID|B08JD7F3F5
KeyboardPrice|$85.99
KeyboardReviews|12
KeyboardRating|77%
================================================
FILE: tasks/playwright_webarena/standard/shopping/printer_keyboard_search/meta.json
================================================
{
"task_id": "printer_keyboard_search",
"task_name": "Printer Keyboard Search",
"category_id": "shopping",
"category_name": "Shopping",
"description": "Search and evaluate office equipment by comparing printer specifications, keyboard ergonomics, analyzing user reviews, tracking prices, and generating detailed purchase recommendations report.",
"author": "Yaoqi Ye",
"created_at": "2025-08-17",
"difficulty": "L3",
"tags": [
"search aggregation",
"comparative analysis",
"content submission"
],
"mcp": [
"playwright"
],
"meta_data": {
"stateType": "video",
"stateContent": null,
"stateUrl": "https://storage.mcpmark.ai/tasks_state/playwright_video/one-stop-market.mp4",
"stateOriginalUrl": "https://github.com/web-arena-x/webarena/tree/main/environment_docker"
}
}
================================================
FILE: tasks/playwright_webarena/standard/shopping/printer_keyboard_search/verify.py
================================================
import asyncio
import sys
import re
import os
import json
from pathlib import Path
def get_model_response():
"""
Get the model's response from the MCP_MESSAGES environment variable.
Returns the last assistant message text.
"""
messages_path = os.getenv("MCP_MESSAGES")
print(f"MCP_MESSAGES: {messages_path}")
if not messages_path:
print("Warning: MCP_MESSAGES environment variable not set", file=sys.stderr)
return None
try:
with open(messages_path, "r") as f:
messages = json.load(f)
# Find the last assistant message
for message in reversed(messages):
if (
message.get("role") == "assistant"
and message.get("status") == "completed"
and message.get("type") == "message"
):
content = message.get("content", [])
for item in content:
if item.get("type") == "output_text":
return item.get("text", "")
print("Warning: No assistant response found in messages", file=sys.stderr)
return None
except Exception as e:
print(f"Error reading messages file: {str(e)}", file=sys.stderr)
return None
def parse_answer_format(text):
"""
    Parse the <answer>...</answer> format from the agent's output.
Returns a dictionary with the parsed values.
"""
if not text:
return None
    # Look for <answer>...</answer> pattern
    match = re.search(r"<answer>(.*?)</answer>", text, re.IGNORECASE | re.DOTALL)
if not match:
return None
answer_content = match.group(1).strip()
# Parse each line
result = {}
lines = answer_content.split("\n")
if len(lines) != 6:
print(f"Error: Expected 6 lines in answer, got {len(lines)}", file=sys.stderr)
return None
for line in lines:
if "|" in line:
key, value = line.split("|", 1)
result[key.strip()] = value.strip()
return result
def load_expected_answer(label_path):
"""
Load the expected answer from label.txt file.
Returns a dictionary with the expected values.
"""
try:
with open(label_path, "r") as f:
lines = f.read().strip().split("\n")
expected = {}
for line in lines:
if "|" in line:
key, value = line.split("|", 1)
expected[key.strip()] = value.strip()
return expected
except Exception as e:
print(f"Error reading label file: {str(e)}", file=sys.stderr)
return None
def compare_answers(model_answer, expected_answer):
"""
Compare the model's answer with the expected answer.
Returns True if all key information matches, False otherwise.
"""
if not model_answer or not expected_answer:
return False
# Check each expected key
mismatches = []
for key, expected_value in expected_answer.items():
model_value = model_answer.get(key, "")
# Special handling for different types of values
if key in ["PrinterPrice", "KeyboardPrice"]:
# For price fields, only support $XX.XX format
# Check if model value has correct format
if not model_value.startswith("$"):
mismatches.append(
f"{key}: incorrect format - expected '$XX.XX' format, got '{model_value}'"
)
else:
# Normalize and compare values
expected_clean = expected_value.replace("$", "").replace(",", "")
model_clean = model_value.replace("$", "").replace(",", "")
if expected_clean != model_clean:
mismatches.append(
f"{key}: expected '{expected_value}', got '{model_value}'"
)
elif key in ["PrinterSKUID", "KeyboardSKUID"]:
# SKU should match exactly (case-insensitive)
if model_value.upper() != expected_value.upper():
mismatches.append(
f"{key}: expected '{expected_value}', got '{model_value}'"
)
elif key == "KeyboardReviews":
# Number of reviews should match exactly
if model_value != expected_value:
mismatches.append(
f"{key}: expected '{expected_value}', got '{model_value}'"
)
elif key == "KeyboardRating":
# Rating should match exactly (including % sign)
if model_value != expected_value:
mismatches.append(
f"{key}: expected '{expected_value}', got '{model_value}'"
)
else:
# Exact match for other fields
if model_value != expected_value:
mismatches.append(
f"{key}: expected '{expected_value}', got '{model_value}'"
)
if mismatches:
print("\n=== Answer Comparison Mismatches ===", file=sys.stderr)
for mismatch in mismatches:
print(f"✗ {mismatch}", file=sys.stderr)
return False
print("\n=== Answer Comparison ===", file=sys.stderr)
print("✓ All key information matches the expected answer", file=sys.stderr)
return True
async def verify() -> bool:
"""
Verifies that the electronic products task has been completed correctly.
Checks the model's answer against the expected label.
"""
# Get the label file path
label_path = Path(__file__).parent / "label.txt"
# Load expected answer
expected_answer = load_expected_answer(label_path)
if not expected_answer:
print("Error: Could not load expected answer from label.txt", file=sys.stderr)
return False
# Get model's response from MCP_MESSAGES
model_response = get_model_response()
if model_response:
print("Found model response, parsing answer format...", file=sys.stderr)
model_answer = parse_answer_format(model_response)
if model_answer:
print("\n=== Model Answer Parsed ===", file=sys.stderr)
for key, value in model_answer.items():
print(f"{key}: {value}", file=sys.stderr)
# Compare answers
answer_match = compare_answers(model_answer, expected_answer)
if not answer_match:
print("\nModel answer does not match expected answer", file=sys.stderr)
return False
print("\n✓ Model answer matches expected answer", file=sys.stderr)
return True
else:
print(
"Warning: Could not parse answer format from model response",
file=sys.stderr,
)
return False
else:
print("No model response found", file=sys.stderr)
return False
def main():
"""
Executes the verification process and exits with a status code.
"""
result = asyncio.run(verify())
sys.exit(0 if result else 1)
if __name__ == "__main__":
main()
================================================
FILE: tasks/playwright_webarena/standard/shopping/running_shoes_purchase/description.md
================================================
1. Find running shoes:
- Price between $50.00-$60.00
- "running shoe" must appear in product name
- Choose the one with highest number of reviews
- Select black or white color, size 10
- Add to cart with quantity 2
2. Record from product page: SKU ID, price, number of reviews, review rating
3. Record cart subtotal
**Output Format:**
```
SKUID|id
Price|$XX.XX
NumberOfReviews|XX
ReviewRating|XX%
Subtotal|$XX.XX
```
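Since the same shoe is added with quantity 2, the recorded cart subtotal should be exactly twice the unit price. A minimal sketch with a placeholder price:
```
# Placeholder price; the real value comes from the product page.
price = 55.00
quantity = 2
subtotal = price * quantity
print(f"Price|${price:.2f}")
print(f"Subtotal|${subtotal:.2f}")
```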
================================================
FILE: tasks/playwright_webarena/standard/shopping/running_shoes_purchase/label.txt
================================================
SKUID|B08KKX1WXQ
Price|$56.21
NumberOfReviews|46
ReviewRating|86%
Subtotal|$112.42
================================================
FILE: tasks/playwright_webarena/standard/shopping/running_shoes_purchase/meta.json
================================================
{
"task_id": "running_shoes_purchase",
"task_name": "Running Shoes Purchase",
"category_id": "shopping",
"category_name": "Shopping",
"description": "Research running footwear by analyzing biomechanical features, comparing cushioning technologies, evaluating durability ratings, considering user preferences, and recommending optimal shoe selections.",
"author": "Yaoqi Ye",
"created_at": "2025-08-17",
"difficulty": "L3",
"tags": [
"search aggregation",
"comparative analysis"
],
"mcp": [
"playwright"
],
"meta_data": {
"stateType": "video",
"stateContent": null,
"stateUrl": "https://storage.mcpmark.ai/tasks_state/playwright_video/one-stop-market.mp4",
"stateOriginalUrl": "https://github.com/web-arena-x/webarena/tree/main/environment_docker"
}
}
================================================
FILE: tasks/playwright_webarena/standard/shopping/running_shoes_purchase/verify.py
================================================
import asyncio
import sys
import re
import os
import json
from pathlib import Path
def get_model_response():
"""
Get the model's response from the MCP_MESSAGES environment variable.
Returns the last assistant message text.
"""
messages_path = os.getenv("MCP_MESSAGES")
print(f"MCP_MESSAGES: {messages_path}")
if not messages_path:
print("Warning: MCP_MESSAGES environment variable not set", file=sys.stderr)
return None
try:
with open(messages_path, "r") as f:
messages = json.load(f)
# Find the last assistant message
for message in reversed(messages):
if (
message.get("role") == "assistant"
and message.get("status") == "completed"
and message.get("type") == "message"
):
content = message.get("content", [])
for item in content:
if item.get("type") == "output_text":
return item.get("text", "")
print("Warning: No assistant response found in messages", file=sys.stderr)
return None
except Exception as e:
print(f"Error reading messages file: {str(e)}", file=sys.stderr)
return None
def parse_answer_format(text):
"""
    Parse the <answer>...</answer> format from the agent's output.
Returns a dictionary with the parsed values.
"""
if not text:
return None
    # Look for <answer>...</answer> pattern
    match = re.search(r"<answer>(.*?)</answer>", text, re.IGNORECASE | re.DOTALL)
if not match:
return None
answer_content = match.group(1).strip()
# Parse each line
result = {}
lines = answer_content.split("\n")
if len(lines) != 5:
print(f"Error: Expected 5 lines in answer, got {len(lines)}", file=sys.stderr)
return None
for line in lines:
if "|" in line:
key, value = line.split("|", 1)
result[key.strip()] = value.strip()
return result
def load_expected_answer(label_path):
"""
Load the expected answer from label.txt file.
Returns a dictionary with the expected values.
"""
try:
with open(label_path, "r") as f:
lines = f.read().strip().split("\n")
expected = {}
for line in lines:
if "|" in line:
key, value = line.split("|", 1)
expected[key.strip()] = value.strip()
return expected
except Exception as e:
print(f"Error reading label file: {str(e)}", file=sys.stderr)
return None
def compare_answers(model_answer, expected_answer):
"""
Compare the model's answer with the expected answer.
Returns True if all key information matches, False otherwise.
"""
if not model_answer or not expected_answer:
return False
# Check each expected key
mismatches = []
for key, expected_value in expected_answer.items():
model_value = model_answer.get(key, "")
# Special handling for different types of values
if key in ["Price", "Subtotal"]:
# For price fields, only support $XX.XX format
# Check if model value has correct format
if not model_value.startswith("$"):
mismatches.append(
f"{key}: incorrect format - expected '$XX.XX' format, got '{model_value}'"
)
else:
# Normalize and compare values
expected_clean = expected_value.replace("$", "").replace(",", "")
model_clean = model_value.replace("$", "").replace(",", "")
# Allow small tolerance for price calculations (within $0.01)
try:
expected_float = float(expected_clean)
model_float = float(model_clean)
if abs(expected_float - model_float) > 0.01:
mismatches.append(
f"{key}: expected '{expected_value}', got '{model_value}'"
)
except ValueError:
if expected_clean != model_clean:
mismatches.append(
f"{key}: expected '{expected_value}', got '{model_value}'"
)
elif key == "SKUID":
# SKU should match exactly (case-insensitive)
if model_value.upper() != expected_value.upper():
mismatches.append(
f"{key}: expected '{expected_value}', got '{model_value}'"
)
elif key == "NumberOfReviews":
# Number of reviews should match exactly
if model_value != expected_value:
mismatches.append(
f"{key}: expected '{expected_value}', got '{model_value}'"
)
elif key == "ReviewRating":
# Rating should match exactly (including % sign)
if model_value != expected_value:
mismatches.append(
f"{key}: expected '{expected_value}', got '{model_value}'"
)
else:
# Exact match for other fields
if model_value != expected_value:
mismatches.append(
f"{key}: expected '{expected_value}', got '{model_value}'"
)
if mismatches:
print("\n=== Answer Comparison Mismatches ===", file=sys.stderr)
for mismatch in mismatches:
print(f"✗ {mismatch}", file=sys.stderr)
return False
print("\n=== Answer Comparison ===", file=sys.stderr)
print("✓ All key information matches the expected answer", file=sys.stderr)
return True
async def verify() -> bool:
"""
Verifies that the running shoes shopping task has been completed correctly.
Checks the model's answer against the expected label.
"""
# Get the label file path
label_path = Path(__file__).parent / "label.txt"
# Load expected answer
expected_answer = load_expected_answer(label_path)
if not expected_answer:
print("Error: Could not load expected answer from label.txt", file=sys.stderr)
return False
# Get model's response from MCP_MESSAGES
model_response = get_model_response()
if model_response:
print("Found model response, parsing answer format...", file=sys.stderr)
model_answer = parse_answer_format(model_response)
if model_answer:
print("\n=== Model Answer Parsed ===", file=sys.stderr)
for key, value in model_answer.items():
print(f"{key}: {value}", file=sys.stderr)
# Compare answers
answer_match = compare_answers(model_answer, expected_answer)
if not answer_match:
print("\nModel answer does not match expected answer", file=sys.stderr)
return False
print("\n✓ Model answer matches expected answer", file=sys.stderr)
return True
else:
print(
"Warning: Could not parse answer format from model response",
file=sys.stderr,
)
return False
else:
print("No model response found", file=sys.stderr)
return False
def main():
"""
Executes the verification process and exits with a status code.
"""
result = asyncio.run(verify())
sys.exit(0 if result else 1)
if __name__ == "__main__":
main()
================================================
FILE: tasks/playwright_webarena/standard/shopping_admin/customer_segmentation_setup/description.md
================================================
Perform customer segmentation setup and analysis in the Magento Admin panel to establish new customer groups and manage customer profiles.
**Task Requirements:**
1. Access the Magento Admin panel to begin customer segmentation setup. If login is required, log in with username 'admin' and password 'admin1234'.
2. Establish baseline metrics for customer groups:
- Record the exact number shown in "records found" at the top of the grid
- This will be your initial groups count
3. Create a specialized customer group for European premium customers:
- Group Name: Premium Europe
- Tax Class: Retail Customer
- Save the group
4. Verify the customer group creation was successful:
- After saving, return to Customer Groups list
- Record the new total shown in "records found"
5. Establish baseline metrics for the All Customers database:
- Record the exact number shown in "records found" at the top of the grid
- This will be your initial customers count
6. Add a representative customer to the new premium group:
- Create a new customer with the following details:
- First Name: Isabella
- Last Name: Romano
- Email: isabella.romano@premium.eu
- Associate to Website: Main Website
- Group: The group you just created
- Save the customer
7. Verify the customer creation was successful:
- After saving, return to All Customers list
- Record the new total shown in "records found"
8. Analyze recent customer activity patterns:
- Navigate to Dashboard
- Look at the "Last Orders" section
- Record the customer name in the last row of the table
9. Compile all your findings and output them in the following exact format:
```
InitialGroups|count
FinalGroups|count
InitialCustomers|count
FinalCustomers|count
LastOrderCustomer|name
```
**Example Output:**
```
InitialGroups|XX
FinalGroups|XX
InitialCustomers|XXX
FinalCustomers|XXX
LastOrderCustomer|XXX
```
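A rough Playwright sketch of steps 3 and 4 (the login selectors mirror this task's verify.py; the group-form route and field selectors are assumptions and may differ in the actual Magento build):
```
import asyncio
from playwright.async_api import async_playwright

async def create_premium_group():
    async with async_playwright() as p:
        browser = await p.chromium.launch(headless=True)
        page = await browser.new_page()
        # Log in if required (step 1).
        await page.goto("http://localhost:7780/admin/", wait_until="networkidle")
        await page.fill('input[name="login[username]"]', "admin")
        await page.fill('input[name="login[password]"]', "admin1234")
        await page.click('button:has-text("Sign in")')
        await page.wait_for_load_state("networkidle")
        # Open the new customer group form (route assumed) and fill it in (selectors assumed).
        await page.goto("http://localhost:7780/admin/customer/group/new/", wait_until="networkidle")
        await page.fill('input[name="code"]', "Premium Europe")
        await page.select_option('select[name="tax_class"]', label="Retail Customer")
        await page.click('button:has-text("Save Customer Group")')
        await browser.close()

asyncio.run(create_premium_group())
```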
================================================
FILE: tasks/playwright_webarena/standard/shopping_admin/customer_segmentation_setup/label.txt
================================================
InitialGroups|4
FinalGroups|5
InitialCustomers|70
FinalCustomers|71
LastOrderCustomer|Ava Brown
================================================
FILE: tasks/playwright_webarena/standard/shopping_admin/customer_segmentation_setup/meta.json
================================================
{
"task_id": "customer_segmentation_setup",
"task_name": "Customer Segmentation Setup",
"category_id": "shopping_admin",
"category_name": "Shopping Admin",
"description": "Configure customer segmentation system in admin panel by defining demographic criteria, creating behavior-based segments, implementing targeting rules, and setting up automated marketing workflows.",
"author": "Fanqing Meng",
"created_at": "2025-08-17",
"difficulty": "L3",
"tags": [
"content submission",
"inventory management"
],
"mcp": [
"playwright"
],
"meta_data": {
"stateType": "video",
"stateContent": null,
"stateUrl": "https://storage.mcpmark.ai/tasks_state/playwright_video/magento-admin.mp4",
"stateOriginalUrl": "https://github.com/web-arena-x/webarena/tree/main/environment_docker"
}
}
================================================
FILE: tasks/playwright_webarena/standard/shopping_admin/customer_segmentation_setup/verify.py
================================================
import asyncio
import sys
import re
import os
import json
from pathlib import Path
from playwright.async_api import (
async_playwright,
TimeoutError as PlaywrightTimeoutError,
)
# Read base_url from the environment variable (shopping_admin injects http://localhost:7780/admin); fall back to localhost by default.
BASE_URL = os.getenv("WEBARENA_BASE_URL", "http://localhost:7780/admin").rstrip("/")
def get_model_response():
"""
Get the model's response from the MCP_MESSAGES environment variable.
Returns the last assistant message text.
"""
messages_path = os.getenv("MCP_MESSAGES")
print(f"MCP_MESSAGES: {messages_path}")
if not messages_path:
print("Warning: MCP_MESSAGES environment variable not set", file=sys.stderr)
return None
try:
with open(messages_path, "r") as f:
messages = json.load(f)
# Find the last assistant message
for message in reversed(messages):
if (
message.get("role") == "assistant"
and message.get("status") == "completed"
):
content = message.get("content", [])
for item in content:
if item.get("type") == "output_text":
return item.get("text", "")
print("Warning: No assistant response found in messages", file=sys.stderr)
return None
except Exception as e:
print(f"Error reading messages file: {str(e)}", file=sys.stderr)
return None
def parse_answer_format(text):
"""
    Parse the <answer>...</answer> format from the agent's output.
Returns a dictionary with the parsed values.
"""
if not text:
return None
    # Look for <answer>...</answer> pattern
    match = re.search(r"<answer>(.*?)</answer>", text, re.IGNORECASE | re.DOTALL)
if not match:
return None
answer_content = match.group(1).strip()
# Parse each line
result = {}
lines = answer_content.split("\n")
if len(lines) != 5:
print(f"Error: Expected 5 lines in answer, got {len(lines)}", file=sys.stderr)
return None
for line in lines:
if "|" in line:
key, value = line.split("|", 1)
result[key.strip()] = value.strip()
return result
def load_expected_answer(label_path):
"""
Load the expected answer from label.txt file.
Returns a dictionary with the expected values.
"""
try:
with open(label_path, "r") as f:
lines = f.read().strip().split("\n")
expected = {}
for line in lines:
if "|" in line:
key, value = line.split("|", 1)
expected[key.strip()] = value.strip()
return expected
except Exception as e:
print(f"Error reading label file: {str(e)}", file=sys.stderr)
return None
def compare_answers(model_answer, expected_answer):
"""
Compare the model's answer with the expected answer.
Returns True if all key information matches, False otherwise.
"""
if not model_answer or not expected_answer:
return False
# Check each expected key
mismatches = []
for key, expected_value in expected_answer.items():
model_value = model_answer.get(key, "")
# Exact match for all fields
if model_value != expected_value:
mismatches.append(
f"{key}: expected '{expected_value}', got '{model_value}'"
)
if mismatches:
print("\n=== Answer Comparison Mismatches ===", file=sys.stderr)
for mismatch in mismatches:
print(f"✗ {mismatch}", file=sys.stderr)
return False
print("\n=== Answer Comparison ===", file=sys.stderr)
print("✓ All key information matches the expected answer", file=sys.stderr)
return True
async def verify() -> bool:
"""
Verifies that the customer segmentation setup task has been completed correctly.
First checks the model's answer against the expected label,
then verifies the actual state in the Magento Admin.
"""
# Get the label file path
label_path = Path(__file__).parent / "label.txt"
# Load expected answer
expected_answer = load_expected_answer(label_path)
if not expected_answer:
print("Error: Could not load expected answer from label.txt", file=sys.stderr)
return False
# Get model's response from MCP_MESSAGES
model_response = get_model_response()
if model_response:
print("Found model response, parsing answer format...", file=sys.stderr)
model_answer = parse_answer_format(model_response)
if model_answer:
print("\n=== Model Answer Parsed ===", file=sys.stderr)
for key, value in model_answer.items():
print(f"{key}: {value}", file=sys.stderr)
# Compare answers
answer_match = compare_answers(model_answer, expected_answer)
if not answer_match:
print("\nModel answer does not match expected answer", file=sys.stderr)
return False
print("\n✓ Model answer matches expected answer", file=sys.stderr)
else:
print(
"Warning: Could not parse answer format from model response",
file=sys.stderr,
)
print("Will proceed with browser verification only", file=sys.stderr)
else:
print(
"No model response found, proceeding with browser verification",
file=sys.stderr,
)
# Browser verification for actual state
print("\n=== Starting Browser Verification ===", file=sys.stderr)
async with async_playwright() as p:
browser = await p.chromium.launch(headless=True)
context = await browser.new_context()
page = await context.new_page()
try:
# Navigate to Magento Admin
print("Navigating to Magento Admin...", file=sys.stderr)
await page.goto(
f"{BASE_URL}/", wait_until="networkidle"
)
# Check if already logged in, if not, login
if "dashboard" not in page.url.lower():
print("Logging into Magento Admin...", file=sys.stderr)
await page.fill('input[name="login[username]"]', "admin")
await page.fill('input[name="login[password]"]', "admin1234")
await page.click('button:has-text("Sign in")')
await page.wait_for_load_state("networkidle")
if "dashboard" not in page.url.lower():
print("Error: Login failed", file=sys.stderr)
return False
print("Successfully logged into Magento Admin", file=sys.stderr)
# 1. Verify Customer Groups
print("\nVerifying Customer Groups...", file=sys.stderr)
await page.goto(
f"{BASE_URL}/customer/group/",
wait_until="networkidle",
)
await page.wait_for_timeout(2000) # Wait for grid to load
# Check for Premium Europe group
premium_europe_exists = (
await page.locator("text=Premium Europe").count() > 0
)
if premium_europe_exists:
print("✓ Found 'Premium Europe' customer group", file=sys.stderr)
# Check if it has Retail Customer tax class
# Look for Premium Europe row and check its tax class
premium_row = page.locator('tr:has-text("Premium Europe")')
if await premium_row.count() > 0:
tax_class_text = await premium_row.locator("td").nth(2).inner_text()
if "Retail Customer" in tax_class_text:
print(
"✓ Premium Europe has 'Retail Customer' tax class",
file=sys.stderr,
)
else:
print(
f"Warning: Premium Europe tax class is '{tax_class_text}'",
file=sys.stderr,
)
else:
print("✗ 'Premium Europe' customer group not found", file=sys.stderr)
return False
# Check total groups count
records_found = page.locator("text=records found").first
if await records_found.count() > 0:
count_text = await records_found.inner_text()
print(f"Customer Groups count: {count_text}", file=sys.stderr)
# Extract number
import re
match = re.search(r"(\d+)\s+records found", count_text)
if match:
groups_count = int(match.group(1))
print(f"✓ Customer groups count is {groups_count}", file=sys.stderr)
# 2. Verify Customer
print("\nVerifying Customer Isabella Romano...", file=sys.stderr)
await page.goto(
f"{BASE_URL}/customer/index/",
wait_until="networkidle",
)
await page.wait_for_timeout(3000) # Wait for grid to load
# Check total customers count
customer_records = page.locator("text=records found").first
if await customer_records.count() > 0:
count_text = await customer_records.inner_text()
print(f"Customers count: {count_text}", file=sys.stderr)
# Extract number
match = re.search(r"(\d+)\s+records found", count_text)
if match:
customers_count = int(match.group(1))
print(
f"✓ Total customers count is {customers_count}", file=sys.stderr
)
# Verify against expected answer if available
if expected_answer and "FinalCustomers" in expected_answer:
expected_final = int(expected_answer["FinalCustomers"])
if customers_count == expected_final:
print(
f"✓ Customer count matches expected: {customers_count}",
file=sys.stderr,
)
else:
print(
f"✗ Customer count mismatch: Expected {expected_final} customers, found {customers_count}",
file=sys.stderr,
)
return False
# Wait for the customer grid to load properly
await page.wait_for_timeout(5000)
# Check if Isabella Romano exists - first wait for grid to load
grid_loaded = False
for i in range(3):
# Look for grid container and wait for it to populate
grid_container = page.locator(".admin__data-grid-outer-wrap, .data-grid, table").first
if await grid_container.count() > 0:
# Check if there are customer rows loaded
customer_rows = page.locator("td[data-column='email'], td:has-text('@')")
if await customer_rows.count() > 0:
grid_loaded = True
break
await page.wait_for_timeout(2000)
if not grid_loaded:
print("✗ Customer grid failed to load properly", file=sys.stderr)
return False
# Now check if Isabella Romano exists in the loaded grid
isabella_exists = (
await page.locator("text=isabella.romano@premium.eu").count() > 0
)
if not isabella_exists:
# Try searching for the customer to be more thorough
try:
search_box = page.locator('input[placeholder*="Search by keyword"], input[name="search"], [data-role="search"]').first
if await search_box.count() > 0:
await search_box.clear()
await search_box.fill("isabella.romano@premium.eu")
await page.keyboard.press("Enter")
await page.wait_for_load_state("networkidle")
await page.wait_for_timeout(3000)
# Check again after search
isabella_exists = (
await page.locator("text=isabella.romano@premium.eu").count() > 0
)
# Also check for "No records found" message
no_records = await page.locator("text=We couldn't find any records., text=No records found").count() > 0
if no_records:
print(
"✗ Customer 'isabella.romano@premium.eu' not found - search returned no results",
file=sys.stderr,
)
return False
except Exception as e:
print(f"✗ Search failed: {str(e)}", file=sys.stderr)
if isabella_exists:
print(
"✓ Found customer with email 'isabella.romano@premium.eu'",
file=sys.stderr,
)
else:
print(
"✗ Customer 'isabella.romano@premium.eu' not found",
file=sys.stderr,
)
return False
# 3. Verify Dashboard Last Orders
print("\nVerifying Dashboard Last Orders...", file=sys.stderr)
await page.goto(
f"{BASE_URL}/admin/dashboard/",
wait_until="networkidle",
)
await page.wait_for_timeout(2000)
# Check for Last Orders section
last_orders_exists = await page.locator("text=Last Orders").count() > 0
if last_orders_exists:
print("✓ Found 'Last Orders' section on dashboard", file=sys.stderr)
# Find the first customer in the table
# Look for the table after "Last Orders" heading
orders_table = (
page.locator("text=Last Orders")
.locator("..")
.locator("table")
.first
)
if await orders_table.count() > 0:
# Get the last row in tbody
last_row = orders_table.locator("tbody tr").last
if await last_row.count() > 0:
last_customer = await last_row.locator(
"td"
).first.inner_text()
print(
f"✓ Last customer in Last Orders: {last_customer}",
file=sys.stderr,
)
# Verify against expected answer if available
if expected_answer and "LastOrderCustomer" in expected_answer:
if last_customer == expected_answer["LastOrderCustomer"]:
print(
f"✓ Last Order Customer matches expected: {last_customer}",
file=sys.stderr,
)
else:
print(
f"✗ Last Order Customer mismatch: Expected '{expected_answer['LastOrderCustomer']}' but actual is '{last_customer}'",
file=sys.stderr,
)
return False
else:
print(
"Warning: 'Last Orders' section not found on dashboard",
file=sys.stderr,
)
# Summary of verification - only print if we reach this point (all checks passed)
print("\n=== Browser Verification Summary ===", file=sys.stderr)
print("✓ Magento Admin login successful", file=sys.stderr)
print(
"✓ Customer group 'Premium Europe' exists with correct tax class",
file=sys.stderr,
)
print("✓ Customer 'isabella.romano@premium.eu' found in system", file=sys.stderr)
print("✓ Customer counts verified", file=sys.stderr)
print("✓ Dashboard Last Orders section accessible", file=sys.stderr)
return True
except PlaywrightTimeoutError as e:
print(f"Error: Timeout occurred - {str(e)}", file=sys.stderr)
return False
except Exception as e:
print(f"Error: Unexpected error - {str(e)}", file=sys.stderr)
return False
finally:
await browser.close()
def main():
"""
Executes the verification process and exits with a status code.
"""
result = asyncio.run(verify())
sys.exit(0 if result else 1)
if __name__ == "__main__":
main()
================================================
FILE: tasks/playwright_webarena/standard/shopping_admin/fitness_promotion_strategy/description.md
================================================
Our marketing team is planning a new promotion for our bestselling fitness products. We need to analyze the current performance of our top-selling items and their related promotions to optimize our strategy.
**Task Requirements:**
1. If login is required, log in with username 'admin' and password 'admin1234'.
2. Start by checking our current bestsellers:
- Identify the top 3 bestselling products based on their Price and Quantity - record their names, prices, and quantities sold
- Note the total Revenue amount displayed
- Check if any of these bestsellers appear in the Top Search Terms table - if yes, record the search term and its usage count, else output 'No:0'
3. Investigate these bestselling products in detail:
- For each of the top 3 bestsellers identified, search for them by name and record:
- Their SKU
- Current inventory quantity
- Whether they are 'Enabled' or 'Disabled'
4. Check if we have existing promotions for these products:
- Look for any active rules that might apply to fitness/yoga products
- Find if there's a rule offering percentage discount - record the rule name and discount percentage
- Count total number of active rules
5. Analyze customer purchasing patterns:
- Count total number of orders in the system
- Note the ID of the most recent order
6. Review our top customers who might be interested:
- Find the customer who appears in the Last Orders section of the dashboard with the highest total
   - Look up this customer in the All Customers list and record their email and customer group
- Count how many other customers are in the same group
7. Compile your findings and output them in the following exact format:
```
Bestseller1|name:price:quantity:sku:inventory:status
Bestseller2|name:price:quantity:sku:inventory:status
Bestseller3|name:price:quantity:sku:inventory:status
TotalRevenue|amount
BestsellerInSearch|term:count
PercentageDiscountRule|name:percentage
ActiveRulesCount|count
TotalOrders|count
MostRecentOrderID|id
TopCustomer|name:email:group
SameGroupCustomers|count
```
**Example Output:**
```
Bestseller1|Product Name:$XX.XX:X:XXX(SKU):X:Enabled/Disabled
Bestseller2|Product Name:$XX.XX:X:XXX(SKU):X:Enabled/Disabled
Bestseller3|Product Name:$XX.XX:X:XXX(SKU):X:Enabled/Disabled
TotalRevenue|$XX.XX
BestsellerInSearch|Term:X or None:0
PercentageDiscountRule|Rule Name:XX%
ActiveRulesCount|X
TotalOrders|X
MostRecentOrderID|X or None
TopCustomer|Customer Name:email@example.com:Group Name
SameGroupCustomers|X
```
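A minimal sketch (placeholder values only) of assembling the pipe-delimited report lines once the findings have been collected:
```
# Placeholder findings; the real values come from the admin panel.
bestsellers = [
    ("Product A", "$10.00", 5, "SKU-A", 100, "Enabled"),
    ("Product B", "$12.00", 4, "SKU-B", 80, "Enabled"),
    ("Product C", "$15.00", 3, "SKU-C", 60, "Disabled"),
]
lines = [
    f"Bestseller{i + 1}|{name}:{price}:{qty}:{sku}:{inventory}:{status}"
    for i, (name, price, qty, sku, inventory, status) in enumerate(bestsellers)
]
lines += [
    "TotalRevenue|$0.00",
    "BestsellerInSearch|No:0",
]
print("\n".join(lines))
```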
================================================
FILE: tasks/playwright_webarena/standard/shopping_admin/fitness_promotion_strategy/label.txt
================================================
Bestseller1|Sprite Stasis Ball 65 cm:$27.00:6:24-WG082-blue:100:Enabled
Bestseller2|Quest Lumaflex™ Band:$19.00:6:24-UG01:100:Enabled
Bestseller3|Sprite Yoga Strap 6 foot:$14.00:6:24-WG085:100:Enabled
TotalRevenue|$0.00
BestsellerInSearch|No:0
PercentageDiscountRule|20% OFF Ever $200-plus purchase!*:20%
ActiveRulesCount|4
TotalOrders|308
MostRecentOrderID|000000299
TopCustomer|Sarah Miller:sarah.miller@example.com:General
SameGroupCustomers|70
================================================
FILE: tasks/playwright_webarena/standard/shopping_admin/fitness_promotion_strategy/meta.json
================================================
{
"task_id": "fitness_promotion_strategy",
"task_name": "Fitness Promotion Strategy",
"category_id": "shopping_admin",
"category_name": "Shopping Admin",
"description": "Develop fitness product promotion campaigns by analyzing sales data, creating targeted offers, configuring promotional rules, and implementing cross-selling strategies in admin dashboard.",
"author": "Fanqing Meng",
"created_at": "2025-08-17",
"difficulty": "L3",
"tags": [
"data extraction",
"comparative analysis",
"inventory management",
"content submission"
],
"mcp": [
"playwright"
],
"meta_data": {
"stateType": "video",
"stateContent": null,
"stateUrl": "https://storage.mcpmark.ai/tasks_state/playwright_video/magento-admin.mp4",
"stateOriginalUrl": "https://github.com/web-arena-x/webarena/tree/main/environment_docker"
}
}
================================================
FILE: tasks/playwright_webarena/standard/shopping_admin/fitness_promotion_strategy/verify.py
================================================
import asyncio
import sys
import re
import os
import json
from pathlib import Path
def get_model_response():
"""
Get the model's response from the MCP_MESSAGES environment variable.
Returns the last assistant message text.
"""
messages_path = os.getenv("MCP_MESSAGES")
print(f"MCP_MESSAGES: {messages_path}")
if not messages_path:
print("Warning: MCP_MESSAGES environment variable not set", file=sys.stderr)
return None
try:
with open(messages_path, 'r') as f:
messages = json.load(f)
# Find the last assistant message
for message in reversed(messages):
if message.get('role') == 'assistant' and message.get('status') == 'completed':
content = message.get('content', [])
for item in content:
if item.get('type') == 'output_text':
return item.get('text', '')
print("Warning: No assistant response found in messages", file=sys.stderr)
return None
except Exception as e:
print(f"Error reading messages file: {str(e)}", file=sys.stderr)
return None
def parse_answer_format(text):
"""
    Parse the <answer>...</answer> format from the agent's output.
Returns a dictionary with the parsed values.
"""
if not text:
return None
    # Look for <answer>...</answer> pattern
    match = re.search(r'<answer>(.*?)</answer>', text, re.IGNORECASE | re.DOTALL)
if not match:
return None
answer_content = match.group(1).strip()
# Parse each line
result = {}
lines = answer_content.split('\n')
# Skip the check for exact number of lines - just parse what we have
# if len(lines) != 13:
# print(f"Error: Expected 13 lines in answer, got {len(lines)}", file=sys.stderr)
# return None
for line in lines:
if '|' in line:
key, value = line.split('|', 1)
result[key.strip()] = value.strip()
return result
def load_expected_answer(label_path):
"""
Load the expected answer from label.txt file.
Returns a dictionary with the expected values.
"""
try:
with open(label_path, 'r') as f:
lines = f.read().strip().split('\n')
expected = {}
for line in lines:
if '|' in line:
key, value = line.split('|', 1)
expected[key.strip()] = value.strip()
return expected
except Exception as e:
print(f"Error reading label file: {str(e)}", file=sys.stderr)
return None
def compare_answers(model_answer, expected_answer):
"""
Compare the model's answer with the expected answer.
Returns True if all key information matches, False otherwise.
"""
if not model_answer or not expected_answer:
return False
# Check each expected key
mismatches = []
for key, expected_value in expected_answer.items():
model_value = model_answer.get(key, '')
# Special handling for different types of values
if key in ['Bestseller1', 'Bestseller2', 'Bestseller3']:
# Check if all parts match (name:price:quantity:sku:inventory:status)
if ':' in expected_value and ':' in model_value:
expected_parts = expected_value.split(':')
model_parts = model_value.split(':')
if len(expected_parts) == 6 and len(model_parts) == 6:
# Compare each part
for i, (exp, mod) in enumerate(zip(expected_parts, model_parts)):
if i == 1: # Price field
exp_clean = exp.replace('$', '').replace(',', '')
mod_clean = mod.replace('$', '').replace(',', '')
if exp_clean != mod_clean:
mismatches.append(f"{key} price: expected '{exp}', got '{mod}'")
elif i == 4: # Inventory field (may have decimal places)
exp_float = float(exp.replace(',', ''))
mod_float = float(mod.replace(',', ''))
if abs(exp_float - mod_float) > 0.0001:
mismatches.append(f"{key} inventory: expected '{exp}', got '{mod}'")
else:
if exp.lower() != mod.lower():
mismatches.append(f"{key} part {i}: expected '{exp}', got '{mod}'")
else:
mismatches.append(f"{key}: format mismatch - expected '{expected_value}', got '{model_value}'")
else:
if expected_value != model_value:
mismatches.append(f"{key}: expected '{expected_value}', got '{model_value}'")
elif key == 'LowestInventoryProduct':
# Check product name and inventory
if ':' in expected_value and ':' in model_value:
expected_name, expected_inv = expected_value.rsplit(':', 1)
model_name, model_inv = model_value.rsplit(':', 1)
if expected_name.lower() != model_name.lower():
mismatches.append(f"{key} name: expected '{expected_name}', got '{model_name}'")
exp_float = float(expected_inv.replace(',', ''))
mod_float = float(model_inv.replace(',', ''))
if abs(exp_float - mod_float) > 0.0001:
mismatches.append(f"{key} inventory: expected '{expected_inv}', got '{model_inv}'")
else:
if expected_value != model_value:
mismatches.append(f"{key}: expected '{expected_value}', got '{model_value}'")
elif key in ['TotalRevenue', 'MinimumPurchaseRule']:
# For price/amount fields, normalize format
expected_clean = expected_value.replace('$', '').replace(',', '')
model_clean = model_value.replace('$', '').replace(',', '')
if expected_clean != model_clean:
mismatches.append(f"{key}: expected '{expected_value}', got '{model_value}'")
elif key == 'BestsellerInSearch':
# Check search term and count
if expected_value.lower() != model_value.lower():
mismatches.append(f"{key}: expected '{expected_value}', got '{model_value}'")
elif key == 'PercentageDiscountRule':
# Check rule name and percentage
if ':' in expected_value and ':' in model_value:
expected_name, expected_pct = expected_value.rsplit(':', 1)
model_name, model_pct = model_value.rsplit(':', 1)
if expected_name != model_name:
mismatches.append(f"{key} name: expected '{expected_name}', got '{model_name}'")
# Normalize percentage (20% vs 20 vs 0.20)
exp_pct_clean = expected_pct.replace('%', '').strip()
mod_pct_clean = model_pct.replace('%', '').strip()
if exp_pct_clean != mod_pct_clean:
mismatches.append(f"{key} percentage: expected '{expected_pct}', got '{model_pct}'")
else:
if expected_value != model_value:
mismatches.append(f"{key}: expected '{expected_value}', got '{model_value}'")
elif key == 'TopCustomer':
# Check name:email:group
if ':' in expected_value and ':' in model_value:
expected_parts = expected_value.split(':')
model_parts = model_value.split(':')
if len(expected_parts) == 3 and len(model_parts) == 3:
exp_name, exp_email, exp_group = expected_parts
mod_name, mod_email, mod_group = model_parts
if exp_name != mod_name:
mismatches.append(f"{key} name: expected '{exp_name}', got '{mod_name}'")
if exp_email.lower() != mod_email.lower():
mismatches.append(f"{key} email: expected '{exp_email}', got '{mod_email}'")
if exp_group.lower() != mod_group.lower():
mismatches.append(f"{key} group: expected '{exp_group}', got '{mod_group}'")
else:
mismatches.append(f"{key}: format mismatch - expected '{expected_value}', got '{model_value}'")
else:
if expected_value != model_value:
mismatches.append(f"{key}: expected '{expected_value}', got '{model_value}'")
elif key == 'MostRecentOrderDate':
# Date format may vary, do flexible comparison
if expected_value.lower() == 'none' and model_value.lower() == 'none':
continue
elif expected_value != model_value:
# Could add more flexible date parsing here if needed
mismatches.append(f"{key}: expected '{expected_value}', got '{model_value}'")
else:
# Exact match for other fields (counts, etc.)
if str(model_value) != str(expected_value):
mismatches.append(f"{key}: expected '{expected_value}', got '{model_value}'")
if mismatches:
print("\n=== Answer Comparison Mismatches ===", file=sys.stderr)
for mismatch in mismatches:
print(f"✗ {mismatch}", file=sys.stderr)
return False
print("\n=== Answer Comparison ===", file=sys.stderr)
print("✓ All key information matches the expected answer", file=sys.stderr)
return True
async def verify() -> bool:
"""
Verifies that the bestseller analysis and promotion task has been completed correctly.
First checks the model's answer against the expected label,
then optionally verifies the actual state in the Magento Admin.
"""
# Get the label file path
label_path = Path(__file__).parent / "label.txt"
# Load expected answer
expected_answer = load_expected_answer(label_path)
if not expected_answer:
print("Error: Could not load expected answer from label.txt", file=sys.stderr)
return False
# Get model's response from MCP_MESSAGES
model_response = get_model_response()
if model_response:
print("Found model response, parsing answer format...", file=sys.stderr)
model_answer = parse_answer_format(model_response)
if model_answer:
print("\n=== Model Answer Parsed ===", file=sys.stderr)
for key, value in model_answer.items():
print(f"{key}: {value}", file=sys.stderr)
# Compare answers
answer_match = compare_answers(model_answer, expected_answer)
if not answer_match:
print("\nModel answer does not match expected answer", file=sys.stderr)
return False
print("\n✓ Model answer matches expected answer", file=sys.stderr)
return True
else:
print("Warning: Could not parse answer format from model response", file=sys.stderr)
return False
else:
print("No model response found", file=sys.stderr)
return False
def main():
"""
Executes the verification process and exits with a status code.
"""
result = asyncio.run(verify())
sys.exit(0 if result else 1)
if __name__ == "__main__":
main()
================================================
FILE: tasks/playwright_webarena/standard/shopping_admin/marketing_customer_analysis/description.md
================================================
Perform a comprehensive marketing and customer analysis workflow in the Magento Admin panel to understand search behavior patterns and promotional effectiveness.
**Task Requirements:**
1. First, we need to access the system to begin our comprehensive analysis:
   If login is required, log in with username 'admin' and password 'admin1234'.
2. Let's start by analyzing customer search behavior to understand what customers are looking for:
Go to Search Terms in Reports and analyze the search data:
- Identify the TOP 2 search terms with the highest number of hits (record exact terms and hit counts)
- Find a search term that has 0 results but still has search hits (record exact term and hit count)
- Count the total number of search terms displayed in the report
3. Next, we'll examine our promotional strategies to understand current marketing efforts:
Navigate to Cart Price Rules and identify:
- Find ALL rules that contain a coupon code
- Record the exact coupon codes and the complete rule names for each
- Count how many active rules exist in total
4. Now let's analyze our email marketing reach and subscriber engagement:
Go to Newsletter Subscribers:
- Apply filter to show only 'Subscribed' status
- Count the total number of subscribed users showing after filter
- Verify whether these TWO emails appear in the subscribed list:
* john.smith.xyz@gmail.com
* admin@magento.com
5. To support our analysis, we need to create test customer profiles for different segments:
Create TWO new customers with the following details:
Customer 1:
- First Name: Marketing1
- Last Name: Analy
- Email: marketdata1.analysis@magento.com
- Associate to Website: Main Website
- Group: General
Customer 2:
- First Name: Analytics1
- Last Name: Report
- Email: analytics1.report@magento.com
- Associate to Website: Main Website
- Group: Wholesale
6. Finally, let's review overall business performance metrics from the main dashboard:
Go to Dashboard and identify:
- The names and sales quantities of the products that are both the best-selling and most expensive
- The total revenue displayed on the dashboard
7. Compile all your findings and output them in the following exact format:
```
Top2SearchTerms|term1:hits1,term2:hits2
ZeroResultTerm|term:hits
TotalSearchTerms|count
CouponCodes|code1:rulename1,code2:rulename2
ActiveRulesCount|count
SubscribedCount|count
EmailVerification|john.smith.xyz@gmail.com:yes/no,admin@magento.com:yes/no
TopProduct|name:quantity
TotalRevenue|amount
```
**Example Output:**
```
Top2SearchTerms|term1:XX,term2:XX
ZeroResultTerm|term:XX
TotalSearchTerms|XX
CouponCodes|CODE:Rule Name Here
ActiveRulesCount|X
SubscribedCount|XX
EmailVerification|john.smith.xyz@gmail.com:yes/no,admin@magento.com:yes/no
TopProduct|Product Name:XX
TotalRevenue|$XX.XX
```
**Success Criteria:**
- Successfully logged into Magento Admin
- Navigated to Search Terms Report and identified top 2 terms
- Found search term with 0 results but has hits
- Counted total search terms in report
- Located all Cart Price Rules with coupon codes
- Extracted exact coupon codes and rule names
- Counted active rules
- Filtered Newsletter Subscribers by 'Subscribed' status
- Counted total subscribed users
- Verified presence of two specific email addresses
- Created two new customers successfully
- Found top bestselling product from dashboard
- Identified total revenue from dashboard
- Output answer in exact format with 9 data lines
- Answer wrapped in <answer></answer> tags
================================================
FILE: tasks/playwright_webarena/standard/shopping_admin/marketing_customer_analysis/label.txt
================================================
Top2SearchTerms|hollister:19,Joust Bag:4
ZeroResultTerm|nike:3
TotalSearchTerms|7
CouponCodes|H20:$4 Luma water bottle (save 70%)
ActiveRulesCount|4
SubscribedCount|1
EmailVerification|john.smith.xyz@gmail.com:yes,admin@magento.com:no
TopProduct|Sprite Stasis Ball 65 cm:6
TotalRevenue|$0.00
================================================
FILE: tasks/playwright_webarena/standard/shopping_admin/marketing_customer_analysis/meta.json
================================================
{
"task_id": "marketing_customer_analysis",
"task_name": "Marketing Customer Analysis",
"category_id": "shopping_admin",
"category_name": "Shopping Admin",
"description": "Analyze customer behavior patterns using admin analytics, segment user demographics, track purchase histories, evaluate campaign effectiveness, and generate comprehensive marketing intelligence reports.",
"author": "Fanqing Meng",
"created_at": "2025-08-17",
"difficulty": "L3",
"tags": [
"data extraction",
"comparative analysis",
"content submission"
],
"mcp": [
"playwright"
],
"meta_data": {
"stateType": "video",
"stateContent": null,
"stateUrl": "https://storage.mcpmark.ai/tasks_state/playwright_video/magento-admin.mp4",
"stateOriginalUrl": "https://github.com/web-arena-x/webarena/tree/main/environment_docker"
}
}
================================================
FILE: tasks/playwright_webarena/standard/shopping_admin/marketing_customer_analysis/verify.py
================================================
import asyncio
import sys
import re
import os
import json
from pathlib import Path
from playwright.async_api import (
async_playwright,
TimeoutError as PlaywrightTimeoutError,
)
# Read the base URL from the environment (shopping_admin injects http://localhost:7780/admin); fall back to localhost by default
BASE_URL = os.getenv("WEBARENA_BASE_URL", "http://localhost:7780/admin").rstrip("/")
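# Example invocation (illustrative): WEBARENA_BASE_URL=http://<host>:7780/admin python verify.py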
def get_model_response():
"""
Get the model's response from the MCP_MESSAGES environment variable.
Returns the last assistant message text.
"""
messages_path = os.getenv("MCP_MESSAGES")
print(f"MCP_MESSAGES: {messages_path}")
if not messages_path:
print("Warning: MCP_MESSAGES environment variable not set", file=sys.stderr)
return None
try:
with open(messages_path, "r") as f:
messages = json.load(f)
# Find the last assistant message
for message in reversed(messages):
if (
message.get("role") == "assistant"
and message.get("status") == "completed"
):
content = message.get("content", [])
for item in content:
if item.get("type") == "output_text":
return item.get("text", "")
print("Warning: No assistant response found in messages", file=sys.stderr)
return None
except Exception as e:
print(f"Error reading messages file: {str(e)}", file=sys.stderr)
return None
def parse_answer_format(text):
"""
Parse the multi-line <answer>...</answer> format from the agent's output.
Returns a dictionary with the parsed values.
"""
if not text:
return None
# Look for <answer>...</answer> pattern
match = re.search(r"<answer>(.*?)</answer>", text, re.IGNORECASE | re.DOTALL)
if not match:
return None
answer_content = match.group(1).strip()
# Parse each line
result = {}
lines = answer_content.split("\n")
if len(lines) != 9:
print(f"Error: Expected 9 lines in answer, got {len(lines)}", file=sys.stderr)
return None
for line in lines:
if "|" in line:
key, value = line.split("|", 1)
result[key.strip()] = value.strip()
return result
def load_expected_answer(label_path):
"""
Load the expected answer from label.txt file.
Returns a dictionary with the expected values.
"""
try:
with open(label_path, "r") as f:
lines = f.read().strip().split("\n")
expected = {}
for line in lines:
if "|" in line:
key, value = line.split("|", 1)
expected[key.strip()] = value.strip()
return expected
except Exception as e:
print(f"Error reading label file: {str(e)}", file=sys.stderr)
return None
def compare_answers(model_answer, expected_answer):
"""
Compare the model's answer with the expected answer.
Returns True if all key information matches, False otherwise.
"""
if not model_answer or not expected_answer:
return False
# Check each expected key
mismatches = []
for key, expected_value in expected_answer.items():
model_value = model_answer.get(key, "")
# Special handling for different types of values
if key == "Top2SearchTerms":
# Check if both search terms are present with correct counts
expected_terms = expected_value.split(",")
model_terms = model_value.split(",")
if set(expected_terms) != set(model_terms):
mismatches.append(
f"{key}: expected '{expected_value}', got '{model_value}'"
)
elif key == "EmailVerification":
# Check email verification status
expected_emails = dict(
item.split(":") for item in expected_value.split(",")
)
model_emails = dict(
item.split(":") for item in model_value.split(",") if ":" in item
)
if expected_emails != model_emails:
mismatches.append(
f"{key}: expected '{expected_value}', got '{model_value}'"
)
elif key == "CouponCodes":
# Check if coupon code and rule name are present
if "H20" not in model_value or "Luma water bottle" not in model_value:
mismatches.append(
f"{key}: expected '{expected_value}', got '{model_value}'"
)
elif key == "TopProduct":
# Check if product name and quantity match
if expected_value != model_value:
mismatches.append(
f"{key}: expected '{expected_value}', got '{model_value}'"
)
else:
# Exact match for other fields
if model_value != expected_value:
mismatches.append(
f"{key}: expected '{expected_value}', got '{model_value}'"
)
if mismatches:
print("\n=== Answer Comparison Mismatches ===", file=sys.stderr)
for mismatch in mismatches:
print(f"✗ {mismatch}", file=sys.stderr)
return False
print("\n=== Answer Comparison ===", file=sys.stderr)
print("✓ All key information matches the expected answer", file=sys.stderr)
return True
async def verify() -> bool:
"""
Verifies that the marketing analysis task has been completed correctly.
First checks the model's answer against the expected label,
then optionally verifies the actual state in the Magento Admin.
"""
# Get the label file path
label_path = Path(__file__).parent / "label.txt"
# Load expected answer
expected_answer = load_expected_answer(label_path)
if not expected_answer:
print("Error: Could not load expected answer from label.txt", file=sys.stderr)
return False
# Get model's response from MCP_MESSAGES
model_response = get_model_response()
if model_response:
print("Found model response, parsing answer format...", file=sys.stderr)
model_answer = parse_answer_format(model_response)
if model_answer:
print("\n=== Model Answer Parsed ===", file=sys.stderr)
for key, value in model_answer.items():
print(f"{key}: {value}", file=sys.stderr)
# Compare answers
answer_match = compare_answers(model_answer, expected_answer)
if not answer_match:
print("\nModel answer does not match expected answer", file=sys.stderr)
return False
print("\n✓ Model answer matches expected answer", file=sys.stderr)
else:
print(
"Warning: Could not parse answer format from model response",
file=sys.stderr,
)
print("Will proceed with browser verification only", file=sys.stderr)
else:
print(
"No model response found, proceeding with browser verification",
file=sys.stderr,
)
# Browser verification - only check customer creation (the critical task requirement)
print("\n=== Starting Browser Verification ===", file=sys.stderr)
async with async_playwright() as p:
browser = await p.chromium.launch(headless=True)
context = await browser.new_context()
page = await context.new_page()
try:
# Navigate to Magento Admin
print("Navigating to Magento Admin...", file=sys.stderr)
await page.goto(
f"{BASE_URL}/", wait_until="networkidle"
)
# Check if already logged in, if not, login
if "dashboard" not in page.url.lower():
print("Logging into Magento Admin...", file=sys.stderr)
await page.fill('input[name="login[username]"]', "admin")
await page.fill('input[name="login[password]"]', "admin1234")
await page.click('button:has-text("Sign in")')
await page.wait_for_load_state("networkidle")
if "dashboard" not in page.url.lower():
print("Error: Login failed", file=sys.stderr)
return False
print("Successfully logged into Magento Admin", file=sys.stderr)
# Verify Customer Creation (the only critical check for task completion)
print("Verifying Customer Creation...", file=sys.stderr)
await page.goto(
f"{BASE_URL}/customer/index/",
wait_until="networkidle",
)
# Wait for the customer grid to load
try:
await page.wait_for_selector("table", timeout=15000)
except PlaywrightTimeoutError:
print("Table not found, trying to proceed anyway...", file=sys.stderr)
# Define customer requirements
customer1_requirements = {
"email": "marketdata1.analysis@magento.com",
"first_name": "Marketing1",
"last_name": "Analy",
"group": "General",
"website": "Main Website"
}
customer2_requirements = {
"email": "analytics1.report@magento.com",
"first_name": "Analytics1",
"last_name": "Report",
"group": "Wholesale",
"website": "Main Website"
}
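# Verification strategy for each customer (summary of the helper below): first confirm the
# email appears anywhere in the grid (using the keyword search if needed), then try to match
# first name, last name, and group within the same table row; if row-scoped matching is not
# possible, fall back to scanning the whole page content.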
async def check_customer_exists(customer_requirements):
"""Check if a customer exists by looking for their details in the customer grid"""
email = customer_requirements["email"]
first_name = customer_requirements["first_name"]
last_name = customer_requirements["last_name"]
group = customer_requirements["group"]
# First check if email exists in current page without searching
email_found = await page.locator(f"*:has-text('{email}')").count() > 0
if not email_found:
# Try searching for the customer
try:
search_box = page.locator('input[placeholder*="Search by keyword"]').first
await search_box.clear()
await search_box.fill(email)
await page.keyboard.press("Enter")
await page.wait_for_load_state("networkidle")
await page.wait_for_timeout(2000)
# Check again after search
email_found = await page.locator(f"*:has-text('{email}')").count() > 0
except Exception:
# Ignore search failures; email_found stays False and is handled below
pass
if not email_found:
return False, f"Email {email} not found"
# More precise validation: find the row containing this customer's email
# Then check if the required fields are in the same row or nearby context
try:
# Find the specific row containing this email
email_cell = page.locator(f"td:has-text('{email}')").first
if await email_cell.count() == 0:
# Fall back to broader search
email_cell = page.locator(f"*:has-text('{email}')").first
# Get the parent row or container
row = email_cell.locator("xpath=ancestor::tr[1]")
if await row.count() == 0:
# Fall back to getting nearby content
row = email_cell.locator("xpath=..")
# Get the text content of the row/container
row_text = await row.text_content() if await row.count() > 0 else ""
# If we can't get a specific row, fall back to broader validation
if not row_text or len(row_text.strip()) < 10:
# Search in nearby cells or elements
nearby_elements = page.locator(f"*:has-text('{email}')").locator("xpath=../following-sibling::* | ../preceding-sibling::*")
nearby_count = await nearby_elements.count()
nearby_text = ""
for i in range(min(nearby_count, 5)): # Check up to 5 nearby elements
element_text = await nearby_elements.nth(i).text_content()
if element_text:
nearby_text += element_text + " "
row_text = row_text + " " + nearby_text
# Check if required fields are present in the row/context
required_fields = [first_name, last_name, group]
found_fields = [email] # Email is already confirmed
missing_fields = []
for field in required_fields:
if field in row_text:
found_fields.append(field)
else:
missing_fields.append(field)
if missing_fields:
return False, f"Customer found but missing fields in row context: {', '.join(missing_fields)}. Row text: {row_text[:100]}..."
return True, f"Customer verified with all required fields: {', '.join(found_fields)}"
except Exception as e:
# Fall back to original simple validation
page_content = await page.content()
required_fields = [first_name, last_name, group, email]
found_fields = []
missing_fields = []
for field in required_fields:
if field in page_content:
found_fields.append(field)
else:
missing_fields.append(field)
if missing_fields:
return False, f"Customer found but missing fields (fallback): {', '.join(missing_fields)}"
return True, f"Customer verified with all required fields (fallback): {', '.join(found_fields)}"
# Check both customers
customer1_exists, customer1_msg = await check_customer_exists(customer1_requirements)
customer2_exists, customer2_msg = await check_customer_exists(customer2_requirements)
print(
f"Customer 1 (marketdata1.analysis@magento.com): {'Found' if customer1_exists else 'Not Found'} - {customer1_msg}",
file=sys.stderr,
)
print(
f"Customer 2 (analytics1.report@magento.com): {'Found' if customer2_exists else 'Not Found'} - {customer2_msg}",
file=sys.stderr,
)
if not (customer1_exists and customer2_exists):
print("Error: Required customers were not found in the system", file=sys.stderr)
return False
print("✓ Both required customers found in the system", file=sys.stderr)
return True
except PlaywrightTimeoutError as e:
print(f"Error: Timeout occurred - {str(e)}", file=sys.stderr)
return False
except Exception as e:
print(f"Error: Unexpected error - {str(e)}", file=sys.stderr)
return False
finally:
await browser.close()
def main():
"""
Executes the verification process and exits with a status code.
"""
result = asyncio.run(verify())
sys.exit(0 if result else 1)
if __name__ == "__main__":
main()
================================================
FILE: tasks/playwright_webarena/standard/shopping_admin/ny_expansion_analysis/description.md
================================================
Our company is planning to expand sales operations to New York state and needs a comprehensive analysis of our current sales performance and tax implications. Please help me gather critical data for our expansion feasibility report.
**Task Requirements:**
1. Log in with username 'admin' and password 'admin1234'
2. First, analyze our current sales performance on the dashboard:
- Check the 'Lifetime Sales' amount displayed
- In the Bestsellers table, identify which product has the lowest price and record its exact name, price, and quantity sold
- Check whether this same product appears in the 'Last Orders' table; if so, note which customer(s) ordered it, otherwise note 'No'
3. Since we're expanding to New York, we need to check taxes:
- Find and record the exact tax rate for New York state
- Compare it with California's tax rate - record which state has a higher rate
- Count how many different US states currently have tax configurations
4. You need to understand the order status configuration used for processing in the NY market:
- Filter the order statuses to show only those with 'Visible On Storefront = Yes'
- Among these visible statuses, identify whether one has the status code 'processing' (Yes or No)
- Check if this 'processing' status is set as a 'Default Status' (Yes or No)
5. Since New York orders might need special handling, check all stores:
- Note the number of websites configured
- Record the store code for the first Main Website Store
6. For inventory planning, check the inventory sources:
- Check if the Default Source is currently 'Enabled' or shows as 'Disabled' for Pickup Location
- Click the 'Edit' link for the Default Source and check if there's a 'State/Province' field (Yes or No)
7. Finally, return to the Dashboard and examine the revenue metrics:
- Record the current Revenue amount shown
- Check if Tax and Shipping amounts are both $0.00 (Yes or No)
**Please provide your findings in the following exact format:**
```
Lifetime_Sales_Amount|amount
Cheap_Bestseller_Name|name
Second_Bestseller_Price|price
Second_Bestseller_Quantity|quantity
Product_In_Last_Orders|yes_or_no
NY_Tax_Rate|rate
CA_Tax_Rate|rate
Higher_Tax_State|state
Total_States_With_Tax|count
Processing_Visible_Storefront|Yes_or_No
Processing_Default_Status|Yes_or_No
Number_Of_Websites|count
Main_Store_Code|code
Default_Source_Pickup_Status|status
Default_Source_State|state_or_none
Dashboard_Revenue|amount
Tax_Shipping_Zero|yes_or_no
```
**Example Output:**
```
Lifetime_Sales_Amount|$XX.XX
Cheap_Bestseller_Name|Product Name Here
Second_Bestseller_Price|$XX.XX
Second_Bestseller_Quantity|XX
Product_In_Last_Orders|Yes/No
NY_Tax_Rate|X.XXXX
CA_Tax_Rate|X.XXXX
Higher_Tax_State|XX
Total_States_With_Tax|XX
Processing_Visible_Storefront|Yes/No
Processing_Default_Status|Yes/No
Number_Of_Websites|X
Main_Store_Code|code_here
Default_Source_Pickup_Status|Enabled/Disabled
Default_Source_State|State or None
Dashboard_Revenue|$XX.XX
Tax_Shipping_Zero|Yes/No
```
================================================
FILE: tasks/playwright_webarena/standard/shopping_admin/ny_expansion_analysis/label.txt
================================================
Lifetime_Sales_Amount|$0.00
Cheap_Bestseller_Name|Sprite Yoga Strap 6 foot
Second_Bestseller_Price|$14.00
Second_Bestseller_Quantity|6
Product_In_Last_Orders|No
NY_Tax_Rate|8.3750
CA_Tax_Rate|8.2500
Higher_Tax_State|NY
Total_States_With_Tax|2
Processing_Visible_Storefront|Yes
Processing_Default_Status|Yes
Number_Of_Websites|1
Main_Store_Code|main_website_store
Default_Source_Pickup_Status|Enabled
Default_Source_State|No
Dashboard_Revenue|$0.00
Tax_Shipping_Zero|Yes
================================================
FILE: tasks/playwright_webarena/standard/shopping_admin/ny_expansion_analysis/meta.json
================================================
{
"task_id": "ny_expansion_analysis",
"task_name": "NY Expansion Analysis",
"category_id": "shopping_admin",
"category_name": "Shopping Admin",
"description": "Prepare New York market expansion strategy by analyzing regional demographics, evaluating competitor presence, assessing logistics requirements, and creating detailed market entry plan.",
"author": "Fanqing Meng",
"created_at": "2025-08-17",
"difficulty": "L3",
"tags": [
"data extraction",
"comparative analysis",
"content submission"
],
"mcp": [
"playwright"
],
"meta_data": {
"stateType": "video",
"stateContent": null,
"stateUrl": "https://storage.mcpmark.ai/tasks_state/playwright_video/magento-admin.mp4",
"stateOriginalUrl": "https://github.com/web-arena-x/webarena/tree/main/environment_docker"
}
}
================================================
FILE: tasks/playwright_webarena/standard/shopping_admin/ny_expansion_analysis/verify.py
================================================
import asyncio
import sys
import re
import os
import json
from pathlib import Path
def get_model_response():
"""
Get the model's response from the MCP_MESSAGES environment variable.
Returns the last assistant message text.
"""
messages_path = os.getenv("MCP_MESSAGES")
print(f"MCP_MESSAGES: {messages_path}")
if not messages_path:
print("ERROR: MCP_MESSAGES environment variable not set", file=sys.stderr)
return None
# Check if file exists
if not Path(messages_path).exists():
print(f"ERROR: Messages file not found at path: {messages_path}", file=sys.stderr)
return None
try:
with open(messages_path, 'r') as f:
content = f.read()
# Check if file is empty
if not content or content.strip() == '""':
print("ERROR: Messages file is empty or contains only empty string", file=sys.stderr)
return None
messages = json.loads(content)
# Check if messages is a list
if not isinstance(messages, list):
print(f"ERROR: Messages file should contain a list, got {type(messages).__name__}", file=sys.stderr)
return None
# Find the last assistant message
for message in reversed(messages):
if message.get('role') == 'assistant' and message.get('status') == 'completed':
content = message.get('content', [])
if not content:
print("WARNING: Assistant message has empty content", file=sys.stderr)
continue
for item in content:
if item.get('type') == 'output_text':
text = item.get('text', '')
if not text:
print("WARNING: Output text is empty", file=sys.stderr)
continue
return text
print("ERROR: No assistant response with output_text found in messages", file=sys.stderr)
return None
except json.JSONDecodeError as e:
print(f"ERROR: Invalid JSON in messages file: {str(e)}", file=sys.stderr)
return None
except Exception as e:
print(f"ERROR: Unexpected error reading messages file: {str(e)}", file=sys.stderr)
return None
def parse_answer_format(text):
"""
Parse the <answer>...</answer> format from the agent's output.
Returns a dictionary with the parsed values.
"""
if not text:
print("ERROR: No text provided to parse", file=sys.stderr)
return None
# Look for <answer>...</answer> pattern
match = re.search(r'<answer>(.*?)</answer>', text, re.IGNORECASE | re.DOTALL)
if not match:
print("ERROR: No tags found in the response", file=sys.stderr)
print(f" Response preview: {text[:200]}...", file=sys.stderr)
return None
answer_content = match.group(1).strip()
if not answer_content:
print("ERROR: Empty content between tags", file=sys.stderr)
return None
# Parse each line
result = {}
lines = answer_content.split('\n')
# Expected keys that should be present
expected_keys = [
'Lifetime_Sales_Amount', 'Cheap_Bestseller_Name', 'Second_Bestseller_Price',
'Second_Bestseller_Quantity', 'Product_In_Last_Orders', 'NY_Tax_Rate',
'CA_Tax_Rate', 'Higher_Tax_State', 'Total_States_With_Tax',
'Processing_Visible_Storefront', 'Processing_Default_Status',
'Number_Of_Websites', 'Main_Store_Code', 'Default_Source_Pickup_Status',
'Default_Source_State', 'Dashboard_Revenue', 'Tax_Shipping_Zero'
]
parsed_keys = []
for line in lines:
line = line.strip()
if not line:
continue
if '|' not in line:
print(f"ERROR: Line missing pipe separator '|': {line}", file=sys.stderr)
continue
parts = line.split('|', 1)
if len(parts) != 2:
print(f"ERROR: Invalid line format: {line}", file=sys.stderr)
continue
key, value = parts
key = key.strip()
value = value.strip()
if not key:
print(f"ERROR: Empty key in line: {line}", file=sys.stderr)
continue
result[key] = value
parsed_keys.append(key)
# Check for missing expected keys
missing_keys = set(expected_keys) - set(parsed_keys)
if missing_keys:
print(f"ERROR: Missing expected keys: {', '.join(sorted(missing_keys))}", file=sys.stderr)
# Check for unexpected keys
unexpected_keys = set(parsed_keys) - set(expected_keys)
if unexpected_keys:
print(f"WARNING: Unexpected keys found: {', '.join(sorted(unexpected_keys))}", file=sys.stderr)
if not result:
print("ERROR: No valid key-value pairs parsed from answer", file=sys.stderr)
return None
return result
def load_expected_answer(label_path):
"""
Load the expected answer from label.txt file.
Returns a dictionary with the expected values.
"""
try:
with open(label_path, 'r') as f:
lines = f.read().strip().split('\n')
expected = {}
for line in lines:
if '|' in line:
key, value = line.split('|', 1)
expected[key.strip()] = value.strip()
return expected
except Exception as e:
print(f"Error reading label file: {str(e)}", file=sys.stderr)
return None
def compare_answers(model_answer, expected_answer):
"""
Compare the model's answer with the expected answer.
Returns True if all key information matches, False otherwise.
"""
if not model_answer or not expected_answer:
return False
# Check each expected key
mismatches = []
for key, expected_value in expected_answer.items():
model_value = model_answer.get(key, '')
# Special handling for different types of values
if key in ['Lifetime_Sales_Amount', 'Second_Bestseller_Price', 'Dashboard_Revenue']:
# For price/amount fields, normalize format
expected_clean = expected_value.replace('$', '').replace(',', '')
model_clean = model_value.replace('$', '').replace(',', '')
if expected_clean != model_clean:
mismatches.append(f"{key}: expected '{expected_value}', got '{model_value}'")
elif key in ['NY_Tax_Rate', 'CA_Tax_Rate']:
# Tax rates - allow different decimal formats
expected_clean = expected_value.replace('%', '').strip()
model_clean = model_value.replace('%', '').strip()
# Convert to float for comparison
try:
if float(expected_clean) != float(model_clean):
mismatches.append(f"{key}: expected '{expected_value}', got '{model_value}'")
except ValueError:
if expected_clean != model_clean:
mismatches.append(f"{key}: expected '{expected_value}', got '{model_value}'")
elif key in ['Product_In_Last_Orders', 'Processing_Visible_Storefront', 'Processing_Default_Status', 'Tax_Shipping_Zero']:
# Yes/No fields - case insensitive
if model_value.lower() != expected_value.lower():
mismatches.append(f"{key}: expected '{expected_value}', got '{model_value}'")
elif key == 'Empty_Rows_Yes_Effect':
# Allow flexible descriptions for this field
# Just check if model provided some reasonable description
if not model_value or len(model_value) < 5:
mismatches.append(f"{key}: expected meaningful description, got '{model_value}'")
elif key == 'Order_Status_Options':
# Check if main options are mentioned
expected_options = set(opt.strip() for opt in expected_value.split(','))
model_options = set(opt.strip() for opt in model_value.split(','))
if expected_options != model_options:
mismatches.append(f"{key}: expected '{expected_value}', got '{model_value}'")
elif key == 'Chart_Disabled_Message':
# Allow some flexibility in message text
# Check for key words
if 'disabled' not in model_value.lower() and 'enable' not in model_value.lower():
mismatches.append(f"{key}: expected message about chart being disabled, got '{model_value}'")
elif key == 'Default_Source_State':
# Handle 'None' or empty state
expected_normalized = expected_value.lower() if expected_value.lower() != 'none' else ''
model_normalized = model_value.lower() if model_value.lower() != 'none' else ''
if expected_normalized != model_normalized:
mismatches.append(f"{key}: expected '{expected_value}', got '{model_value}'")
else:
# Exact match for other fields
if model_value != expected_value:
mismatches.append(f"{key}: expected '{expected_value}', got '{model_value}'")
if mismatches:
print("\n=== Answer Comparison Mismatches ===", file=sys.stderr)
for mismatch in mismatches:
print(f"✗ {mismatch}", file=sys.stderr)
return False
print("\n=== Answer Comparison ===", file=sys.stderr)
print("✓ All key information matches the expected answer", file=sys.stderr)
return True
async def verify() -> bool:
"""
Verifies that the NY expansion analysis task has been completed correctly.
First checks the model's answer against the expected label,
then optionally verifies the actual state in the Magento Admin.
"""
print("\n=== Starting Verification ===", file=sys.stderr)
# Get the label file path
label_path = Path(__file__).parent / "label.txt"
# Load expected answer
print("Loading expected answer from label.txt...", file=sys.stderr)
expected_answer = load_expected_answer(label_path)
if not expected_answer:
print("FATAL ERROR: Could not load expected answer from label.txt", file=sys.stderr)
return False
print(f"Expected answer loaded with {len(expected_answer)} keys", file=sys.stderr)
# Get model's response from MCP_MESSAGES
print("\nReading model response from MCP_MESSAGES...", file=sys.stderr)
model_response = get_model_response()
if not model_response:
print("FATAL ERROR: No valid model response found", file=sys.stderr)
return False
print(f"Model response found (length: {len(model_response)} chars)", file=sys.stderr)
print("\nParsing answer format from model response...", file=sys.stderr)
model_answer = parse_answer_format(model_response)
if not model_answer:
print("FATAL ERROR: Could not parse answer format from model response", file=sys.stderr)
return False
print(f"\n=== Model Answer Parsed Successfully ===", file=sys.stderr)
print(f"Parsed {len(model_answer)} key-value pairs", file=sys.stderr)
for key, value in model_answer.items():
print(f" {key}: {value}", file=sys.stderr)
# Compare answers
print("\n=== Comparing Model Answer with Expected Answer ===", file=sys.stderr)
answer_match = compare_answers(model_answer, expected_answer)
if not answer_match:
print("\nFATAL ERROR: Model answer does not match expected answer", file=sys.stderr)
print("Verification FAILED", file=sys.stderr)
return False
print("\n✓ Model answer matches expected answer", file=sys.stderr)
print("Verification PASSED", file=sys.stderr)
return True
def main():
"""
Executes the verification process and exits with a status code.
"""
result = asyncio.run(verify())
sys.exit(0 if result else 1)
if __name__ == "__main__":
main()
================================================
FILE: tasks/playwright_webarena/standard/shopping_admin/products_sales_analysis/description.md
================================================
Perform a comprehensive products and sales analysis in the Magento Admin panel to identify inventory status and sales performance metrics.
**Task Requirements:**
1. If login is required, log in with username 'admin' and password 'admin1234'
2. Analyze product inventory and catalog details, perform the following:
- Search for all products containing 'Yoga' in their name - count the exact number of results
- Clear the search and find the product with SKU 'WH11' - record its exact price
- Apply a filter to show only products with Quantity = 0.0000 - count how many products match
3. To identify top-selling products and revenue metrics, navigate to the Dashboard and from the Bestsellers table:
- Identify the product with the lowest price and lowest quantity - record the product name and quantity sold
- Find the second cheapest product in the table - record its exact quantity sold
- Note the total Revenue amount displayed in the dashboard
4. Gather all customers' information and demographics:
- Find customer 'Sarah Miller' - record her exact email address
- Count the total number of customers shown in the grid
5. To review order status and customer purchase history, go to the Orders grid under Sales:
- Count the total number of orders with 'Pending' status
- Find the order ID (starting with "000") of Grace Nguyen's most expensive order with Complete status
6. To provide a comprehensive report of all gathered data, compile all your findings and output them in the following exact format:
```
YogaProducts|count
WH11Price|price
ZeroQuantityProducts|count
LowestProduct|name:quantity
QuestLumaflexQuantity|quantity
DashboardRevenue|amount
SarahMillerEmail|email
TotalCustomers|count
PendingOrders|count
GraceNguyenOrderID|orderid
```
**Example Output:**
```
YogaProducts|XX
WH11Price|$XX.XX
ZeroQuantityProducts|XX
LowestProduct|Product Name Here:XX
QuestLumaflexQuantity|XX
DashboardRevenue|$XX.XX
SarahMillerEmail|email@example.com
TotalCustomers|XX
PendingOrders|X
GraceNguyenOrderID|00000XXXX
```
================================================
FILE: tasks/playwright_webarena/standard/shopping_admin/products_sales_analysis/label.txt
================================================
YogaProducts|171
WH11Price|$54.00
ZeroQuantityProducts|150
LowestProduct|Sprite Stasis Ball 55 cm foot:5
QuestLumaflexQuantity|6
DashboardRevenue|$0.00
SarahMillerEmail|helloworld@yahoo.com
TotalCustomers|72
PendingOrders|10
GraceNguyenOrderID|000000189
================================================
FILE: tasks/playwright_webarena/standard/shopping_admin/products_sales_analysis/meta.json
================================================
{
"task_id": "products_sales_analysis",
"task_name": "Products Sales Analysis",
"category_id": "shopping_admin",
"category_name": "Shopping Admin",
"description": "Generate comprehensive sales performance reports by extracting product metrics, analyzing revenue trends, identifying top performers, evaluating inventory turnover, and creating actionable insights.",
"author": "Fanqing Meng",
"created_at": "2025-08-17",
"difficulty": "L3",
"tags": [
"data extraction",
"comparative analysis",
"content submission"
],
"mcp": [
"playwright"
],
"meta_data": {
"stateType": "video",
"stateContent": null,
"stateUrl": "https://storage.mcpmark.ai/tasks_state/playwright_video/magento-admin.mp4",
"stateOriginalUrl": "https://github.com/web-arena-x/webarena/tree/main/environment_docker"
}
}
================================================
FILE: tasks/playwright_webarena/standard/shopping_admin/products_sales_analysis/verify.py
================================================
import asyncio
import sys
import re
import os
import json
from pathlib import Path
def get_model_response():
"""
Get the model's response from the MCP_MESSAGES environment variable.
Returns the last assistant message text.
"""
messages_path = os.getenv("MCP_MESSAGES")
print(f"MCP_MESSAGES: {messages_path}")
if not messages_path:
print("Warning: MCP_MESSAGES environment variable not set", file=sys.stderr)
return None
try:
with open(messages_path, "r") as f:
messages = json.load(f)
# Find the last assistant message
for message in reversed(messages):
if (
message.get("role") == "assistant"
and message.get("status") == "completed"
):
content = message.get("content", [])
for item in content:
if item.get("type") == "output_text":
return item.get("text", "")
print("Warning: No assistant response found in messages", file=sys.stderr)
return None
except Exception as e:
print(f"Error reading messages file: {str(e)}", file=sys.stderr)
return None
def parse_answer_format(text):
"""
Parse the <answer>...</answer> format from the agent's output.
Returns a dictionary with the parsed values.
"""
if not text:
print("Error: No text provided to parse", file=sys.stderr)
return None
# Look for <answer>...</answer> pattern
match = re.search(r"<answer>(.*?)</answer>", text, re.IGNORECASE | re.DOTALL)
if not match:
print("Error: No ... tags found in response", file=sys.stderr)
return None
answer_content = match.group(1).strip()
if not answer_content:
print("Error: Empty answer content", file=sys.stderr)
return None
# Parse each line
result = {}
lines = [line.strip() for line in answer_content.split("\n") if line.strip()]
if len(lines) != 10:
print(f"Error: Expected 10 lines in answer, got {len(lines)}", file=sys.stderr)
print(f"Lines found: {lines}", file=sys.stderr)
return None
# Expected keys for validation
expected_keys = [
"YogaProducts", "WH11Price", "ZeroQuantityProducts", "LowestProduct",
"QuestLumaflexQuantity", "DashboardRevenue", "SarahMillerEmail",
"TotalCustomers", "PendingOrders", "GraceNguyenOrderID"
]
for line in lines:
if "|" not in line:
print(f"Error: Line missing '|' separator: {line}", file=sys.stderr)
return None
parts = line.split("|", 1)
if len(parts) != 2:
print(f"Error: Invalid line format: {line}", file=sys.stderr)
return None
key, value = parts[0].strip(), parts[1].strip()
if not key or not value:
print(f"Error: Empty key or value in line: {line}", file=sys.stderr)
return None
result[key] = value
# Validate all expected keys are present
missing_keys = set(expected_keys) - set(result.keys())
if missing_keys:
print(f"Error: Missing required keys: {missing_keys}", file=sys.stderr)
return None
return result
def load_expected_answer(label_path):
"""
Load the expected answer from label.txt file.
Returns a dictionary with the expected values.
"""
try:
with open(label_path, "r") as f:
lines = f.read().strip().split("\n")
expected = {}
for line in lines:
if "|" in line:
key, value = line.split("|", 1)
expected[key.strip()] = value.strip()
return expected
except Exception as e:
print(f"Error reading label file: {str(e)}", file=sys.stderr)
return None
def compare_answers(model_answer, expected_answer):
"""
Compare the model's answer with the expected answer.
Returns True if all key information matches, False otherwise.
"""
if not model_answer or not expected_answer:
return False
# Check each expected key
mismatches = []
for key, expected_value in expected_answer.items():
model_value = model_answer.get(key, "")
# Special handling for different types of values
if key == "LowestProduct":
# Check if product name and quantity match (format: "Product Name:quantity")
if ":" in expected_value and ":" in model_value:
expected_name, expected_qty = expected_value.rsplit(":", 1)
model_name, model_qty = model_value.rsplit(":", 1)
if expected_name != model_name or expected_qty != model_qty:
mismatches.append(
f"{key}: expected '{expected_value}', got '{model_value}'"
)
else:
if expected_value != model_value:
mismatches.append(
f"{key}: expected '{expected_value}', got '{model_value}'"
)
elif key in ["WH11Price", "DashboardRevenue"]:
# For price/amount fields, normalize format
expected_clean = expected_value.replace("$", "").replace(",", "")
model_clean = model_value.replace("$", "").replace(",", "")
if expected_clean != model_clean:
mismatches.append(
f"{key}: expected '{expected_value}', got '{model_value}'"
)
elif key == "SarahMillerEmail":
# Email should match exactly
if model_value.lower() != expected_value.lower():
mismatches.append(
f"{key}: expected '{expected_value}', got '{model_value}'"
)
else:
# Exact match for other fields
if model_value != expected_value:
mismatches.append(
f"{key}: expected '{expected_value}', got '{model_value}'"
)
if mismatches:
print("\n=== Answer Comparison Mismatches ===", file=sys.stderr)
for mismatch in mismatches:
print(f"✗ {mismatch}", file=sys.stderr)
return False
print("\n=== Answer Comparison ===", file=sys.stderr)
print("✓ All key information matches the expected answer", file=sys.stderr)
return True
async def verify() -> bool:
"""
Verifies that the products and sales analysis task has been completed correctly.
First checks the model's answer against the expected label,
then optionally verifies the actual state in the Magento Admin.
"""
# Get the label file path
label_path = Path(__file__).parent / "label.txt"
# Load expected answer
expected_answer = load_expected_answer(label_path)
if not expected_answer:
print("Error: Could not load expected answer from label.txt", file=sys.stderr)
return False
# Get model's response from MCP_MESSAGES
model_response = get_model_response()
if model_response:
print("Found model response, parsing answer format...", file=sys.stderr)
model_answer = parse_answer_format(model_response)
if model_answer:
print("\n=== Model Answer Parsed ===", file=sys.stderr)
for key, value in model_answer.items():
print(f"{key}: {value}", file=sys.stderr)
# Compare answers
answer_match = compare_answers(model_answer, expected_answer)
if not answer_match:
print("\nModel answer does not match expected answer", file=sys.stderr)
return False
print("\n✓ Model answer matches expected answer", file=sys.stderr)
return True
else:
print(
"Warning: Could not parse answer format from model response",
file=sys.stderr,
)
return False
else:
print("No model response found", file=sys.stderr)
return False
def main():
"""
Executes the verification process and exits with a status code.
"""
result = asyncio.run(verify())
sys.exit(0 if result else 1)
if __name__ == "__main__":
main()
================================================
FILE: tasks/playwright_webarena/standard/shopping_admin/sales_inventory_analysis/description.md
================================================
Perform a comprehensive sales and inventory analysis by extracting specific metrics from multiple sections of the Magento Admin panel.
**Task Requirements:**
1. Login with username 'admin' and password 'admin1234'
2. To analyze product inventory and identify key items, check all products:
- Search for all products containing 'Sprite' in their name - count the exact number of results
- Clear the search and filter products by Quantity = 100.0000 - count how many products match
- Find the product with SKU 'WS12' - record its exact name and price
3. To understand sales performance and order status, we need to check all orders:
- Search for all orders with 'Pending' status - count the total number
- Find Grace Nguyen's cheapest order with Complete status - record the order ID (starts with "000")
- Find the order with the highest Grand Total - record the customer name and amount
4. To examine bestselling products and search trends, from the main page:
- In the Bestsellers table, identify the product with the most quantity and the lowest price - record its name and quantity sold
- Find 'Overnight Duffle' and record its exact price
- In the Top Search Terms table, find 'hollister' and record its position number (1st, 2nd, etc.)
5. To analyze customer demographics and account information, go to All Customers:
- Search for customers whose email address contains 'costello' - count the results
- Find Sarah Miller's customer record - record her Group and her 'Customer Since' date
6. To review payment status and billing information, navigate to Invoices:
- Find all invoices with 'Paid' status - count them
- Find the invoice for order #000000002 - record the Bill-to Name
7. To provide a comprehensive report of all gathered data, compile all findings and output them in the following exact format:
```
SpriteProducts|count
Quantity100Products|count
WS12Info|name:price
PendingOrders|count
GraceOrderID|orderid
HighestOrderInfo|customer:amount
CheapProduct|name:quantity
OvernightDufflePrice|price
HollisterPosition|position
CostelloCustomers|count
SarahMillerInfo|group:date
PaidInvoices|count
Invoice002BillTo|name
```
**Example Output:**
```
SpriteProducts|XX
Quantity100Products|XX
WS12Info|Product Name Here:$XX.XX
PendingOrders|X
GraceOrderID|00000XXXX
HighestOrderInfo|Customer Name:$XXX.XX
CheapProduct|Product Name:XX
OvernightDufflePrice|$XX.XX
HollisterPosition|Xth
CostelloCustomers|X
SarahMillerInfo|Group Name:MMM DD, YYYY
PaidInvoices|X
Invoice002BillTo|Customer Name
```
================================================
FILE: tasks/playwright_webarena/standard/shopping_admin/sales_inventory_analysis/label.txt
================================================
SpriteProducts|16
Quantity100Products|1886
WS12Info|Radiant Tee:$22.00
PendingOrders|10
GraceOrderID|000000114
HighestOrderInfo|Samantha Jones:$292.40
CheapProduct|Sprite Yoga Strap 6 foot:6
OvernightDufflePrice|$45.00
HollisterPosition|1st
CostelloCustomers|0
SarahMillerInfo|General:Apr 19, 2023 5:45:07 PM
PaidInvoices|2
Invoice002BillTo|Veronica Costello
================================================
FILE: tasks/playwright_webarena/standard/shopping_admin/sales_inventory_analysis/meta.json
================================================
{
"task_id": "sales_inventory_analysis",
"task_name": "Sales Inventory Analysis",
"category_id": "shopping_admin",
"category_name": "Shopping Admin",
"description": "Analyze sales patterns and inventory levels to optimize stock management, identify slow-moving items, predict demand trends, and generate restocking recommendations.",
"author": "Fanqing Meng",
"created_at": "2025-08-17",
"difficulty": "L3",
"tags": [
"data extraction",
"comparative analysis",
"inventory management"
],
"mcp": [
"playwright"
],
"meta_data": {
"stateType": "video",
"stateContent": null,
"stateUrl": "https://storage.mcpmark.ai/tasks_state/playwright_video/magento-admin.mp4",
"stateOriginalUrl": "https://github.com/web-arena-x/webarena/tree/main/environment_docker"
}
}
================================================
FILE: tasks/playwright_webarena/standard/shopping_admin/sales_inventory_analysis/verify.py
================================================
import asyncio
import sys
import re
import os
import json
from pathlib import Path
def get_model_response():
"""
Get the model's response from the MCP_MESSAGES environment variable.
Returns the last assistant message text.
"""
messages_path = os.getenv("MCP_MESSAGES")
print(f"MCP_MESSAGES: {messages_path}")
if not messages_path:
print("Warning: MCP_MESSAGES environment variable not set", file=sys.stderr)
return None
try:
with open(messages_path, "r") as f:
messages = json.load(f)
# Find the last assistant message with type='message', status='completed'
for message in reversed(messages):
if (
message.get("role") == "assistant"
and message.get("status") == "completed"
and message.get("type") == "message"
):
content = message.get("content", [])
for item in content:
# Check for both 'text' and 'output_text' types
if item.get("type") in ["text", "output_text"]:
return item.get("text", "")
print("Warning: No assistant response found in messages", file=sys.stderr)
return None
except Exception as e:
print(f"Error reading messages file: {str(e)}", file=sys.stderr)
return None
def parse_answer_format(text):
"""
Parse the <answer>...</answer> format from the agent's output.
Returns a dictionary with the parsed values.
"""
if not text:
print("ERROR: No text provided to parse", file=sys.stderr)
return None
# Look for <answer>...</answer> pattern
match = re.search(r"<answer>(.*?)</answer>", text, re.IGNORECASE | re.DOTALL)
if not match:
print("ERROR: No ... tags found in the response", file=sys.stderr)
print("Response text preview (first 200 chars):", text[:200], file=sys.stderr)
return None
answer_content = match.group(1).strip()
print(f"Found answer content with {len(answer_content)} characters", file=sys.stderr)
# Parse each line
result = {}
lines = answer_content.split("\n")
# Expected keys for this task
expected_keys = [
"SpriteProducts", "Quantity100Products", "WS12Info", "PendingOrders",
"GraceOrderID", "HighestOrderInfo", "CheapProduct", "OvernightDufflePrice",
"HollisterPosition", "CostelloCustomers", "SarahMillerInfo",
"PaidInvoices", "Invoice002BillTo"
]
if len(lines) != 13:
print(f"ERROR: Expected 13 lines in answer, got {len(lines)}", file=sys.stderr)
print(f"Lines found: {lines}", file=sys.stderr)
return None
for i, line in enumerate(lines, 1):
if "|" not in line:
print(f"ERROR: Line {i} does not contain pipe separator '|': '{line}'", file=sys.stderr)
return None
parts = line.split("|", 1)
if len(parts) != 2:
print(f"ERROR: Line {i} could not be split into key|value: '{line}'", file=sys.stderr)
return None
key, value = parts
result[key.strip()] = value.strip()
# Check if all expected keys are present
missing_keys = set(expected_keys) - set(result.keys())
if missing_keys:
print(f"ERROR: Missing expected keys: {missing_keys}", file=sys.stderr)
print(f"Keys found: {list(result.keys())}", file=sys.stderr)
return None
# Check for unexpected keys
extra_keys = set(result.keys()) - set(expected_keys)
if extra_keys:
print(f"WARNING: Unexpected keys found: {extra_keys}", file=sys.stderr)
return result
def load_expected_answer(label_path):
"""
Load the expected answer from label.txt file.
Returns a dictionary with the expected values.
"""
try:
with open(label_path, "r") as f:
lines = f.read().strip().split("\n")
expected = {}
for line in lines:
if "|" in line:
key, value = line.split("|", 1)
expected[key.strip()] = value.strip()
return expected
except Exception as e:
print(f"Error reading label file: {str(e)}", file=sys.stderr)
return None
def compare_answers(model_answer, expected_answer):
"""
Compare the model's answer with the expected answer.
Returns True if all key information matches, False otherwise.
"""
if not model_answer or not expected_answer:
return False
# Check each expected key
mismatches = []
for key, expected_value in expected_answer.items():
model_value = model_answer.get(key, "")
# Special handling for different types of values
if key == "WS12Info":
# Check if product name and price match (format: name:price)
if ":" in expected_value and ":" in model_value:
expected_name, expected_price = expected_value.rsplit(":", 1)
model_name, model_price = model_value.rsplit(":", 1)
# Normalize price format
expected_price_clean = expected_price.replace("$", "").replace(",", "")
model_price_clean = model_price.replace("$", "").replace(",", "")
if (
expected_name != model_name
or expected_price_clean != model_price_clean
):
mismatches.append(
f"{key}: expected '{expected_value}', got '{model_value}'"
)
else:
if expected_value != model_value:
mismatches.append(
f"{key}: expected '{expected_value}', got '{model_value}'"
)
elif key == "GraceOrderID":
# Order ID should start with "000" and match exactly
if not model_value.startswith("000"):
mismatches.append(
f"{key}: expected to start with '000', got '{model_value}'"
)
elif model_value != expected_value:
mismatches.append(
f"{key}: expected '{expected_value}', got '{model_value}'"
)
elif key == "HighestOrderInfo":
# Check format customer:amount
if ":" in expected_value and ":" in model_value:
expected_customer, expected_amount = expected_value.rsplit(":", 1)
model_customer, model_amount = model_value.rsplit(":", 1)
# Normalize amount format
expected_amount_clean = expected_amount.replace("$", "").replace(
",", ""
)
model_amount_clean = model_amount.replace("$", "").replace(",", "")
if (
expected_customer != model_customer
or expected_amount_clean != model_amount_clean
):
mismatches.append(
f"{key}: expected '{expected_value}', got '{model_value}'"
)
else:
if expected_value != model_value:
mismatches.append(
f"{key}: expected '{expected_value}', got '{model_value}'"
)
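# 'Position2Product' does not appear in this task's label.txt, so the next branch is not
# exercised when comparing against the expected answer.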
elif key == "Position2Product":
# Check if product name and quantity match
if ":" in expected_value and ":" in model_value:
expected_name, expected_qty = expected_value.rsplit(":", 1)
model_name, model_qty = model_value.rsplit(":", 1)
if expected_name != model_name or expected_qty != model_qty:
mismatches.append(
f"{key}: expected '{expected_value}', got '{model_value}'"
)
else:
if expected_value != model_value:
mismatches.append(
f"{key}: expected '{expected_value}', got '{model_value}'"
)
elif key == "OvernightDufflePrice":
# Normalize price format
expected_clean = expected_value.replace("$", "").replace(",", "")
model_clean = model_value.replace("$", "").replace(",", "")
if expected_clean != model_clean:
mismatches.append(
f"{key}: expected '{expected_value}', got '{model_value}'"
)
elif key == "HollisterPosition":
# Position format (1st, 2nd, 3rd, etc.)
if model_value.lower() != expected_value.lower():
mismatches.append(
f"{key}: expected '{expected_value}', got '{model_value}'"
)
elif key == "SarahMillerInfo":
# Format: group:date
if ":" in expected_value and ":" in model_value:
expected_group, expected_date = expected_value.split(":", 1)
model_group, model_date = model_value.split(":", 1)
# Allow some flexibility in date format
if expected_group != model_group:
mismatches.append(
f"{key}: expected group '{expected_group}', got '{model_group}'"
)
# For date, check if key parts match
if not (expected_date in model_date or model_date in expected_date):
mismatches.append(
f"{key}: expected date '{expected_date}', got '{model_date}'"
)
else:
if expected_value != model_value:
mismatches.append(
f"{key}: expected '{expected_value}', got '{model_value}'"
)
elif key == "Invoice002BillTo":
# Name should match exactly
if model_value != expected_value:
mismatches.append(
f"{key}: expected '{expected_value}', got '{model_value}'"
)
else:
# Exact match for count fields and other numeric values
if model_value != expected_value:
mismatches.append(
f"{key}: expected '{expected_value}', got '{model_value}'"
)
if mismatches:
print("\n=== Answer Comparison Mismatches ===", file=sys.stderr)
for mismatch in mismatches:
print(f"✗ {mismatch}", file=sys.stderr)
return False
print("\n=== Answer Comparison ===", file=sys.stderr)
print("✓ All key information matches the expected answer", file=sys.stderr)
return True
async def verify() -> bool:
"""
Verifies that the sales and inventory analysis task has been completed correctly.
First checks the model's answer against the expected label,
then optionally verifies the actual state in the Magento Admin.
"""
print("\n" + "="*60, file=sys.stderr)
print("Starting verification of Task 5", file=sys.stderr)
print("="*60, file=sys.stderr)
# Get the label file path
label_path = Path(__file__).parent / "label.txt"
# Load expected answer
print("\n--- Loading Expected Answer ---", file=sys.stderr)
expected_answer = load_expected_answer(label_path)
if not expected_answer:
print("FATAL ERROR: Could not load expected answer from label.txt", file=sys.stderr)
return False
print(f"Successfully loaded {len(expected_answer)} expected values", file=sys.stderr)
# Get model's response from MCP_MESSAGES
print("\n--- Loading Model Response ---", file=sys.stderr)
model_response = get_model_response()
if not model_response:
print("FATAL ERROR: No model response found in MCP_MESSAGES", file=sys.stderr)
return False
print(f"Found model response ({len(model_response)} characters)", file=sys.stderr)
print("\n--- Parsing Answer Format ---", file=sys.stderr)
model_answer = parse_answer_format(model_response)
if not model_answer:
print("\nFATAL ERROR: Could not parse answer format from model response", file=sys.stderr)
print("Verification FAILED", file=sys.stderr)
return False
print("\n=== Model Answer Successfully Parsed ===", file=sys.stderr)
for key, value in model_answer.items():
print(f" {key}: {value}", file=sys.stderr)
# Compare answers
print("\n--- Comparing Answers ---", file=sys.stderr)
answer_match = compare_answers(model_answer, expected_answer)
if not answer_match:
print("\n" + "="*60, file=sys.stderr)
print("VERIFICATION FAILED: Model answer does not match expected answer", file=sys.stderr)
print("="*60, file=sys.stderr)
return False
print("\n" + "="*60, file=sys.stderr)
print("✓ VERIFICATION PASSED: Model answer matches expected answer", file=sys.stderr)
print("="*60, file=sys.stderr)
return True
def main():
"""
Executes the verification process and exits with a status code.
"""
result = asyncio.run(verify())
sys.exit(0 if result else 1)
if __name__ == "__main__":
main()
================================================
FILE: tasks/playwright_webarena/standard/shopping_admin/search_filtering_operations/description.md
================================================
Perform comprehensive search and filtering operations in the Magento Admin panel to extract specific business insights using advanced search techniques.
**Task Requirements:**
1. Login with username 'admin' and password 'admin1234'
2. To analyze search behavior and term effectiveness, check the Search Terms of Marketing and perform complex filtering:
- Search for all terms containing 'tank' in their name - count the exact number of results
- Clear filters and find terms with exactly 0 results - count how many such terms exist
- Apply a filter to show only terms with more than 10 uses - record the term with the highest uses and its count (You need to see how many there are and record them all.)
- Find the search term that has results between 20-30 - record its name and exact result count
3. To gather detailed marketing insights from search data, go to Search Terms in Reports:
- Apply filter for terms with more than 15 hits - count total filtered results
- Find the term with ID between 10-15 that has the most results - record term name and result count (You need to see how many there are and record them all.)
- Filter to show only terms from "Default Store View" - count total results
4. To examine real-time search trends and top performers, from the Dashboard, perform targeted searches:
- In the 'Top Search Terms' table, find the term with exactly 1 result - record its name and uses
- In the 'Last Search Terms' table, identify the term with both the highest number of results and uses - record its name and number of results
- In the 'Bestsellers' tab, find the product at position #3 - record name and quantity
5. To identify patterns in search usage and results, return to the Search Terms main grid from step 2:
- Sort by 'Uses' column (descending) - record the top term and its uses count
- Sort by 'Results' column (ascending) - record the first non-zero result term and its count
- Count total number of unique search terms in the system
6. To provide a comprehensive report of all gathered data, compile all findings and output in the following exact format:
```
TankSearchCount|count
ZeroResultsCount|count
HighestUseTerm|term:uses
Results20to30Term|term1:results1|term2:results2|term3:results3|...
Hits15PlusCount|count
ID10to15MaxResults|term:results
DefaultStoreViewCount|count
OneResultTerm|term1:uses1|term2:uses2|term3:uses3|...
HighestResultLastSearch|term:results
Position3Bestseller|product:quantity
TopUseTerm|term:uses
FirstNonZeroResult|term:results
TotalUniqueTerms|count
```
**Example Output:**
```
TankSearchCount|X
ZeroResultsCount|X
HighestUseTerm|search_term:XX
Results20to30Term|search_term1:XX1|search_term2:XX2|search_term3:XX3|...
Hits15PlusCount|X
ID10to15MaxResults|search_term:XX
DefaultStoreViewCount|X
OneResultTerm|search_term1:XX1|search_term2:XX2|search_term3:XX3|...
HighestResultLastSearch|search_term:XX
Position3Bestseller|Product Name:X
TopUseTerm|search_term:XX
FirstNonZeroResult|search_term:X
TotalUniqueTerms|X
```
**Success Criteria:**
- Successfully logged into Magento Admin
- Applied complex search filters in Search Terms section
- Used range filters for results and hits
- Sorted columns to find specific records
- Navigated between different report views
- Extracted data from filtered and sorted results
- Counted records accurately after applying filters
- Output answer in exact format with 13 data lines
- Answer wrapped in <answer></answer> tags
================================================
FILE: tasks/playwright_webarena/standard/shopping_admin/search_filtering_operations/label.txt
================================================
TankSearchCount|2
ZeroResultsCount|1
HighestUseTerm|hollister:19
Results20to30Term|Antonia Racer Tank:23|tanks:23
Hits15PlusCount|1
ID10to15MaxResults|Antonia Racer Tank:23
DefaultStoreViewCount|7
OneResultTerm|hollister:19|WP10:1
HighestResultLastSearch|Antonia Racer Tank:23
Position3Bestseller|Sprite Stasis Ball 65 cm:6
TopUseTerm|hollister:19
FirstNonZeroResult|WP10:1
TotalUniqueTerms|7
================================================
FILE: tasks/playwright_webarena/standard/shopping_admin/search_filtering_operations/meta.json
================================================
{
"task_id": "search_filtering_operations",
"task_name": "Search Filtering Operations",
"category_id": "shopping_admin",
"category_name": "Shopping Admin",
"description": "Configure advanced search and filtering systems in admin interface, implement category hierarchies, set up attribute filters, and optimize search algorithms for user experience.",
"author": "Fanqing Meng",
"created_at": "2025-08-17",
"difficulty": "L3",
"tags": [
"content submission"
],
"mcp": [
"playwright"
],
"meta_data": {
"stateType": "video",
"stateContent": null,
"stateUrl": "https://storage.mcpmark.ai/tasks_state/playwright_video/magento-admin.mp4",
"stateOriginalUrl": "https://github.com/web-arena-x/webarena/tree/main/environment_docker"
}
}
================================================
FILE: tasks/playwright_webarena/standard/shopping_admin/search_filtering_operations/verify.py
================================================
import re
import json
import os
import sys
def verify(messages):
"""
Verify that the agent has successfully performed complex search and filtering operations
in the Magento Admin panel and extracted all required information correctly.
Args:
messages: List of message dictionaries containing the conversation
Returns:
Dictionary with 'valid' boolean and 'reason' string
"""
# Find the last assistant message with status "completed" and type "message"
answer_content = None
for message in reversed(messages):
if (
message.get("role") == "assistant"
and message.get("status") == "completed"
and message.get("type") == "message"
and message.get("content")
):
# Extract text from content structure
content = message["content"]
if isinstance(content, list):
for item in content:
if isinstance(item, dict) and item.get("type") == "output_text":
text = item.get("text", "")
# Look for answer tags with case-insensitive search
answer_match = re.search(
r"(.*?)", text, re.DOTALL | re.IGNORECASE
)
if answer_match:
answer_content = answer_match.group(1).strip()
break
elif isinstance(content, str):
# Look for answer tags in string content
                answer_match = re.search(r"<answer>(.*?)</answer>", content, re.DOTALL | re.IGNORECASE)
if answer_match:
answer_content = answer_match.group(1).strip()
break
if answer_content:
break
if not answer_content:
return {"valid": False, "reason": "No answer found in tags"}
# Expected format - each line should have a key|value pair
expected_keys = [
"TankSearchCount",
"ZeroResultsCount",
"HighestUseTerm",
"Results20to30Term",
"Hits15PlusCount",
"ID10to15MaxResults",
"DefaultStoreViewCount",
"OneResultTerm",
"HighestResultLastSearch",
"Position3Bestseller",
"TopUseTerm",
"FirstNonZeroResult",
"TotalUniqueTerms",
]
# Parse the answer
lines = answer_content.strip().split("\n")
# Check if we have exactly 13 lines
if len(lines) != 13:
return {"valid": False, "reason": f"Expected 13 data lines, found {len(lines)}"}
# Parse each line and validate format
extracted_data = {}
for line in lines:
if "|" not in line:
return {
"valid": False,
"reason": f"Invalid format in line: {line}. Expected 'key|value' format",
}
parts = line.split("|", 1)
if len(parts) != 2:
return {"valid": False, "reason": f"Invalid format in line: {line}"}
key, value = parts
extracted_data[key] = value
# Check all required keys are present
missing_keys = set(expected_keys) - set(extracted_data.keys())
if missing_keys:
return {
"valid": False,
"reason": f"Missing required keys: {', '.join(missing_keys)}",
}
# Validate specific data formats and expected values based on the current data
# 1. TankSearchCount should be a number (2 terms containing 'tank')
if not extracted_data["TankSearchCount"].isdigit():
return {
"valid": False,
"reason": f"TankSearchCount should be a number, got: {extracted_data['TankSearchCount']}",
}
# Expected: "Antonia Racer Tank" and "tanks" contain 'tank'
if extracted_data["TankSearchCount"] != "2":
return {
"valid": False,
"reason": f"TankSearchCount should be '2', got: {extracted_data['TankSearchCount']}",
}
# 2. ZeroResultsCount should be a number (nike has 0 results)
if not extracted_data["ZeroResultsCount"].isdigit():
return {
"valid": False,
"reason": f"ZeroResultsCount should be a number, got: {extracted_data['ZeroResultsCount']}",
}
if extracted_data["ZeroResultsCount"] != "1":
return {
"valid": False,
"reason": f"ZeroResultsCount should be '1', got: {extracted_data['ZeroResultsCount']}",
}
# 3. HighestUseTerm should be in format "term:uses"
if ":" not in extracted_data["HighestUseTerm"]:
return {
"valid": False,
"reason": f"HighestUseTerm should be in format 'term:uses', got: {extracted_data['HighestUseTerm']}",
}
# hollister has 19 uses (highest among terms with > 10 uses)
if extracted_data["HighestUseTerm"] != "hollister:19":
return {
"valid": False,
"reason": f"HighestUseTerm should be 'hollister:19', got: {extracted_data['HighestUseTerm']}",
}
# 4. Results20to30Term should be in format "term:results"
if ":" not in extracted_data["Results20to30Term"]:
return {
"valid": False,
"reason": f"Results20to30Term should be in format 'term:results', got: {extracted_data['Results20to30Term']}",
}
# Both "tanks" and "Antonia Racer Tank" have 23 results (between 20-30)
valid_results20to30 = ["tanks:23", "Antonia Racer Tank:23"]
# Check if answer contains one of the valid values or both separated by |
if not any(
val in extracted_data["Results20to30Term"] for val in valid_results20to30
):
return {
"valid": False,
"reason": f"Results20to30Term should contain 'tanks:23' or 'Antonia Racer Tank:23', got: {extracted_data['Results20to30Term']}",
}
# 5. Hits15PlusCount should be a number (only hollister has 19 hits > 15)
if not extracted_data["Hits15PlusCount"].isdigit():
return {
"valid": False,
"reason": f"Hits15PlusCount should be a number, got: {extracted_data['Hits15PlusCount']}",
}
if extracted_data["Hits15PlusCount"] != "1":
return {
"valid": False,
"reason": f"Hits15PlusCount should be '1', got: {extracted_data['Hits15PlusCount']}",
}
# 6. ID10to15MaxResults should be in format "term:results"
if ":" not in extracted_data["ID10to15MaxResults"]:
return {
"valid": False,
"reason": f"ID10to15MaxResults should be in format 'term:results', got: {extracted_data['ID10to15MaxResults']}",
}
# ID 11 is hollister (1 result), ID 13 is Antonia Racer Tank (23 results)
if extracted_data["ID10to15MaxResults"] != "Antonia Racer Tank:23":
return {
"valid": False,
"reason": f"ID10to15MaxResults should be 'Antonia Racer Tank:23', got: {extracted_data['ID10to15MaxResults']}",
}
# 7. DefaultStoreViewCount should be a number (all 7 terms are from Default Store View)
if not extracted_data["DefaultStoreViewCount"].isdigit():
return {
"valid": False,
"reason": f"DefaultStoreViewCount should be a number, got: {extracted_data['DefaultStoreViewCount']}",
}
if extracted_data["DefaultStoreViewCount"] != "7":
return {
"valid": False,
"reason": f"DefaultStoreViewCount should be '7', got: {extracted_data['DefaultStoreViewCount']}",
}
# 8. OneResultTerm should be in format "term:uses"
if ":" not in extracted_data["OneResultTerm"]:
return {
"valid": False,
"reason": f"OneResultTerm should be in format 'term:uses', got: {extracted_data['OneResultTerm']}",
}
# Both hollister and WP10 have exactly 1 result
valid_one_result = ["hollister:19", "WP10:1"]
if not any(val in extracted_data["OneResultTerm"] for val in valid_one_result):
return {
"valid": False,
"reason": f"OneResultTerm should contain 'hollister:19' or 'WP10:1', got: {extracted_data['OneResultTerm']}",
}
# 9. HighestResultLastSearch should be in format "term:results"
if ":" not in extracted_data["HighestResultLastSearch"]:
return {
"valid": False,
"reason": f"HighestResultLastSearch should be in format 'term:results', got: {extracted_data['HighestResultLastSearch']}",
}
# In Last Search Terms: tanks and Antonia Racer Tank both have 23 results (highest)
valid_highest_last = ["tanks:23", "Antonia Racer Tank:23"]
if not any(
val in extracted_data["HighestResultLastSearch"] for val in valid_highest_last
):
return {
"valid": False,
"reason": f"HighestResultLastSearch should contain 'tanks:23' or 'Antonia Racer Tank:23', got: {extracted_data['HighestResultLastSearch']}",
}
# 10. Position3Bestseller should be in format "product:quantity"
if ":" not in extracted_data["Position3Bestseller"]:
return {
"valid": False,
"reason": f"Position3Bestseller should be in format 'product:quantity', got: {extracted_data['Position3Bestseller']}",
}
# Position 3 in Bestsellers is "Sprite Stasis Ball 65 cm" with quantity 6
if extracted_data["Position3Bestseller"] != "Sprite Stasis Ball 65 cm:6":
return {
"valid": False,
"reason": f"Position3Bestseller should be 'Sprite Stasis Ball 65 cm:6', got: {extracted_data['Position3Bestseller']}",
}
# 11. TopUseTerm should be in format "term:uses"
if ":" not in extracted_data["TopUseTerm"]:
return {
"valid": False,
"reason": f"TopUseTerm should be in format 'term:uses', got: {extracted_data['TopUseTerm']}",
}
# hollister has 19 uses (highest)
if extracted_data["TopUseTerm"] != "hollister:19":
return {
"valid": False,
"reason": f"TopUseTerm should be 'hollister:19', got: {extracted_data['TopUseTerm']}",
}
# 12. FirstNonZeroResult should be in format "term:results"
if ":" not in extracted_data["FirstNonZeroResult"]:
return {
"valid": False,
"reason": f"FirstNonZeroResult should be in format 'term:results', got: {extracted_data['FirstNonZeroResult']}",
}
# When sorted by results ascending, first non-zero is WP10 (has 1 result)
if extracted_data["FirstNonZeroResult"] != "WP10:1":
return {
"valid": False,
"reason": f"FirstNonZeroResult should be 'WP10:1', got: {extracted_data['FirstNonZeroResult']}",
}
# 13. TotalUniqueTerms should be a number
if not extracted_data["TotalUniqueTerms"].isdigit():
return {
"valid": False,
"reason": f"TotalUniqueTerms should be a number, got: {extracted_data['TotalUniqueTerms']}",
}
# There are 7 unique search terms in the system
if extracted_data["TotalUniqueTerms"] != "7":
return {
"valid": False,
"reason": f"TotalUniqueTerms should be '7', got: {extracted_data['TotalUniqueTerms']}",
}
# All validations passed
return {
"valid": True,
"reason": "All complex search and filtering operations completed successfully",
}
if __name__ == "__main__":
# Load messages from environment variable
messages_path = os.getenv("MCP_MESSAGES")
if not messages_path:
print(
json.dumps(
{"valid": False, "reason": "MCP_MESSAGES environment variable not set"}
)
)
exit(1)
try:
with open(messages_path, "r") as f:
messages = json.load(f)
except Exception as e:
print(
json.dumps({"valid": False, "reason": f"Failed to load messages: {str(e)}"})
)
exit(1)
# Run verification
result = verify(messages)
print(json.dumps(result))
# Exit with appropriate code based on verification result
sys.exit(0 if result["valid"] else 1)
================================================
FILE: tasks/postgres/easy/.gitkeep
================================================
================================================
FILE: tasks/postgres/easy/chinook/customer_data_migration_basic/description.md
================================================
Migrate customer data from an acquired company to PostgreSQL using efficient bulk operations.
## Your Mission:
Chinook Music Store has recently acquired "MelodyMart," a competing music retailer. Their customer database needs to be migrated into Chinook's PostgreSQL database.
## Migration Requirements:
1. **Process all customer records from the data table below** and migrate them into the `Customer` table
2. **Apply business logic during migration**:
   - Assign `CustomerId` values starting from the next available ID
- Assign all customers to support representative with EmployeeId 3
- Set `Fax` field to NULL for all migrated customers
## Customer Data to Migrate:
| FirstName | LastName | Company | Address | City | State | Country | PostalCode | Phone | Email |
|-----------|----------|---------|---------|------|-------|---------|------------|-------|--------|
| Danielle | Johnson | Sanchez-Taylor | 819 Johnson Course | East William | AK | USA | 74064 | 386-3794 | danielle.johnson@sancheztaylor.com |
| Katherine | Moore | Peterson-Moore | 16155 Roman Stream Suite 816 | New Kellystad | OK | USA | 25704 | 103-4131 | katherine_moore@petersonmoore.org |
| Joshua | Reid | Martin-Kelly | 192 Frank Light Suite 835 | East Lydiamouth | MO | USA | 35594 | 139-5376 | joshua_reid@martinkelly.org |
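One possible shape for the migration, shown here only for the first data row (a sketch, not the required approach; the remaining rows follow the same pattern, and a single multi-row INSERT works just as well):

```sql
-- Sketch: insert one migrated customer with the next available CustomerId,
-- SupportRepId = 3, and Fax left as NULL.
INSERT INTO "Customer"
    ("CustomerId", "FirstName", "LastName", "Company", "Address", "City",
     "State", "Country", "PostalCode", "Phone", "Fax", "Email", "SupportRepId")
SELECT MAX("CustomerId") + 1,
       'Danielle', 'Johnson', 'Sanchez-Taylor', '819 Johnson Course',
       'East William', 'AK', 'USA', '74064', '386-3794',
       NULL, 'danielle.johnson@sancheztaylor.com', 3
FROM "Customer";
```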
================================================
FILE: tasks/postgres/easy/chinook/customer_data_migration_basic/meta.json
================================================
{
"task_id": "customer_data_migration_basic",
"task_name": "Customer Data Migration Basic",
"category_id": "chinook",
"category_name": "Chinook",
"description": "Load the MelodyMart customer rows into the Customer table with new ids, SupportRepId = 3, and Fax values set to NULL.",
"author": "Lingxiao Du",
"created_at": "2025-11-15",
"difficulty": "L1",
"tags": [
"data migration",
"transactional operations"
],
"mcp": [
"postgres"
],
"meta_data": {
"stateType": "text",
"stateContent": "Table \"Album\" {\n \"AlbumId\" int4 [pk, not null]\n \"Title\" varchar(160) [not null]\n \"ArtistId\" int4 [not null]\n\n Indexes {\n ArtistId [type: btree, name: \"IFK_AlbumArtistId\"]\n }\n}\n\nTable \"Artist\" {\n \"ArtistId\" int4 [pk, not null]\n \"Name\" varchar(120)\n}\n\nTable \"Customer\" {\n \"CustomerId\" int4 [pk, not null]\n \"FirstName\" varchar(40) [not null]\n \"LastName\" varchar(20) [not null]\n \"Company\" varchar(80)\n \"Address\" varchar(70)\n \"City\" varchar(40)\n \"State\" varchar(40)\n \"Country\" varchar(40)\n \"PostalCode\" varchar(10)\n \"Phone\" varchar(24)\n \"Fax\" varchar(24)\n \"Email\" varchar(60) [not null]\n \"SupportRepId\" int4\n\n Indexes {\n SupportRepId [type: btree, name: \"IFK_CustomerSupportRepId\"]\n }\n}\n\nTable \"Employee\" {\n \"EmployeeId\" int4 [pk, not null]\n \"LastName\" varchar(20) [not null]\n \"FirstName\" varchar(20) [not null]\n \"Title\" varchar(30)\n \"ReportsTo\" int4\n \"BirthDate\" timestamp\n \"HireDate\" timestamp\n \"Address\" varchar(70)\n \"City\" varchar(40)\n \"State\" varchar(40)\n \"Country\" varchar(40)\n \"PostalCode\" varchar(10)\n \"Phone\" varchar(24)\n \"Fax\" varchar(24)\n \"Email\" varchar(60)\n\n Indexes {\n ReportsTo [type: btree, name: \"IFK_EmployeeReportsTo\"]\n }\n}\n\nTable \"Genre\" {\n \"GenreId\" int4 [pk, not null]\n \"Name\" varchar(120)\n}\n\nTable \"Invoice\" {\n \"InvoiceId\" int4 [pk, not null]\n \"CustomerId\" int4 [not null]\n \"InvoiceDate\" timestamp [not null]\n \"BillingAddress\" varchar(70)\n \"BillingCity\" varchar(40)\n \"BillingState\" varchar(40)\n \"BillingCountry\" varchar(40)\n \"BillingPostalCode\" varchar(10)\n \"Total\" numeric(10,2) [not null]\n\n Indexes {\n CustomerId [type: btree, name: \"IFK_InvoiceCustomerId\"]\n }\n}\n\nTable \"InvoiceLine\" {\n \"InvoiceLineId\" int4 [pk, not null]\n \"InvoiceId\" int4 [not null]\n \"TrackId\" int4 [not null]\n \"UnitPrice\" numeric(10,2) [not null]\n \"Quantity\" int4 [not null]\n\n Indexes {\n InvoiceId [type: btree, name: \"IFK_InvoiceLineInvoiceId\"]\n TrackId [type: btree, name: \"IFK_InvoiceLineTrackId\"]\n }\n}\n\nTable \"MediaType\" {\n \"MediaTypeId\" int4 [pk, not null]\n \"Name\" varchar(120)\n}\n\nTable \"Playlist\" {\n \"PlaylistId\" int4 [pk, not null]\n \"Name\" varchar(120)\n}\n\nTable \"PlaylistTrack\" {\n \"PlaylistId\" int4 [not null]\n \"TrackId\" int4 [not null]\n\n Indexes {\n (PlaylistId, TrackId) [type: btree, name: \"PK_PlaylistTrack\"]\n TrackId [type: btree, name: \"IFK_PlaylistTrackTrackId\"]\n }\n}\n\nTable \"Track\" {\n \"TrackId\" int4 [pk, not null]\n \"Name\" varchar(200) [not null]\n \"AlbumId\" int4\n \"MediaTypeId\" int4 [not null]\n \"GenreId\" int4\n \"Composer\" varchar(220)\n \"Milliseconds\" int4 [not null]\n \"Bytes\" int4\n \"UnitPrice\" numeric(10,2) [not null]\n\n Indexes {\n AlbumId [type: btree, name: \"IFK_TrackAlbumId\"]\n GenreId [type: btree, name: \"IFK_TrackGenreId\"]\n MediaTypeId [type: btree, name: \"IFK_TrackMediaTypeId\"]\n }\n}\n\nRef \"FK_AlbumArtistId\":\"Artist\".\"ArtistId\" < \"Album\".\"ArtistId\"\n\nRef \"FK_CustomerSupportRepId\":\"Employee\".\"EmployeeId\" < \"Customer\".\"SupportRepId\"\n\nRef \"FK_EmployeeReportsTo\":\"Employee\".\"EmployeeId\" < \"Employee\".\"ReportsTo\"\n\nRef \"FK_InvoiceCustomerId\":\"Customer\".\"CustomerId\" < \"Invoice\".\"CustomerId\"\n\nRef \"FK_InvoiceLineInvoiceId\":\"Invoice\".\"InvoiceId\" < \"InvoiceLine\".\"InvoiceId\"\n\nRef \"FK_InvoiceLineTrackId\":\"Track\".\"TrackId\" < \"InvoiceLine\".\"TrackId\"\n\nRef 
\"FK_PlaylistTrackPlaylistId\":\"Playlist\".\"PlaylistId\" < \"PlaylistTrack\".\"PlaylistId\"\n\nRef \"FK_PlaylistTrackTrackId\":\"Track\".\"TrackId\" < \"PlaylistTrack\".\"TrackId\"\n\nRef \"FK_TrackAlbumId\":\"Album\".\"AlbumId\" < \"Track\".\"AlbumId\"\n\nRef \"FK_TrackGenreId\":\"Genre\".\"GenreId\" < \"Track\".\"GenreId\"\n\nRef \"FK_TrackMediaTypeId\":\"MediaType\".\"MediaTypeId\" < \"Track\".\"MediaTypeId\"\n",
"stateUrl": null,
"stateOriginalUrl": "https://github.com/neondatabase-labs/postgres-sample-dbs/blob/main/chinook.sql"
}
}
================================================
FILE: tasks/postgres/easy/chinook/customer_data_migration_basic/verify.py
================================================
"""
Verification script for PostgreSQL Task 2: Customer Data Migration
"""
import os
import sys
import psycopg2
import pickle
def get_connection_params() -> dict:
"""Get database connection parameters."""
return {
"host": os.getenv("POSTGRES_HOST", "localhost"),
"port": int(os.getenv("POSTGRES_PORT", 5432)),
"database": os.getenv("POSTGRES_DATABASE"),
"user": os.getenv("POSTGRES_USERNAME"),
"password": os.getenv("POSTGRES_PASSWORD")
}
def load_expected_customers():
"""Load the expected customer data from pickle file."""
script_dir = os.path.dirname(os.path.abspath(__file__))
pkl_path = os.path.join(script_dir, 'customer_data.pkl')
try:
with open(pkl_path, 'rb') as f:
return pickle.load(f)
except FileNotFoundError:
print(f"❌ customer_data.pkl not found at {pkl_path}. Please generate customer data first.")
return None
except Exception as e:
print(f"❌ Error loading customer data: {e}")
return None
def verify_migrated_customers(conn, expected_customers) -> bool:
"""Verify migrated customers by comparing with expected data as sets."""
with conn.cursor() as cur:
# Get all customers with ID > 59 (the migrated ones)
cur.execute('''
SELECT "FirstName", "LastName", "Company", "Address", "City",
"State", "Country", "PostalCode", "Phone", "Email",
"SupportRepId", "Fax"
FROM "Customer"
WHERE "CustomerId" > 59
''')
actual_customers = cur.fetchall()
if len(actual_customers) != len(expected_customers):
print(f"❌ Expected {len(expected_customers)} migrated customers, found {len(actual_customers)}")
return False
# Convert expected customers to tuples for set comparison
expected_tuples = set()
for expected in expected_customers:
expected_tuple = (
expected['FirstName'], expected['LastName'], expected['Company'],
expected['Address'], expected['City'], expected['State'],
expected['Country'], expected['PostalCode'], expected['Phone'],
expected['Email'], 3, None # SupportRepId=3, Fax=None
)
expected_tuples.add(expected_tuple)
# Convert actual customers to set with proper type conversion
actual_tuples = set()
for row in actual_customers:
# Convert all fields to strings for consistent comparison
actual_tuple = (
str(row[0]) if row[0] is not None else '', # FirstName
str(row[1]) if row[1] is not None else '', # LastName
str(row[2]) if row[2] is not None else '', # Company
str(row[3]) if row[3] is not None else '', # Address
str(row[4]) if row[4] is not None else '', # City
str(row[5]) if row[5] is not None else '', # State
str(row[6]) if row[6] is not None else '', # Country
str(row[7]) if row[7] is not None else '', # PostalCode
str(row[8]) if row[8] is not None else '', # Phone
str(row[9]) if row[9] is not None else '', # Email
int(row[10]) if row[10] is not None else None, # SupportRepId
row[11] # Fax (should be None)
)
actual_tuples.add(actual_tuple)
# Check if sets are equal
if actual_tuples != expected_tuples:
missing_in_actual = expected_tuples - actual_tuples
extra_in_actual = actual_tuples - expected_tuples
print(f"❌ Customer data sets don't match!")
if missing_in_actual:
print(f" Missing {len(missing_in_actual)} expected customers")
for missing in list(missing_in_actual)[:3]: # Show first 3
print(f" Missing: {missing[0]} {missing[1]} - {missing[2]}")
if len(missing_in_actual) > 3:
print(f" ... and {len(missing_in_actual) - 3} more")
if extra_in_actual:
print(f" Found {len(extra_in_actual)} unexpected customers")
for extra in list(extra_in_actual)[:3]: # Show first 3
print(f" Extra: {extra[0]} {extra[1]} - {extra[2]}")
if len(extra_in_actual) > 3:
print(f" ... and {len(extra_in_actual) - 3} more")
return False
print(f"✅ All {len(expected_customers)} customers migrated correctly")
print(f"✅ All customers assigned to SupportRepId 3")
print(f"✅ All customers have Fax field set to NULL")
print(f"✅ Customer data sets match exactly (order-independent)")
return True
def main():
"""Main verification function."""
print("=" * 60)
print("Verifying Customer Data Migration Task")
print("=" * 60)
# Load expected customer data
expected_customers = load_expected_customers()
if not expected_customers:
sys.exit(1)
print(f"Loaded {len(expected_customers)} expected customer records")
# Get connection parameters
conn_params = get_connection_params()
if not conn_params["database"]:
print("❌ No database specified")
sys.exit(1)
try:
# Connect to database
conn = psycopg2.connect(**conn_params)
# Verify migration
success = verify_migrated_customers(conn, expected_customers)
conn.close()
if success:
print("\n🎉 Task verification: PASS")
sys.exit(0)
else:
print("\n❌ Task verification: FAIL")
sys.exit(1)
except psycopg2.Error as e:
print(f"❌ Database error: {e}")
sys.exit(1)
except Exception as e:
print(f"❌ Verification error: {e}")
sys.exit(1)
if __name__ == "__main__":
main()
================================================
FILE: tasks/postgres/easy/chinook/update_employee_info/description.md
================================================
Update employee information in the Chinook database to reflect organizational changes.
## Your Tasks:
### **UPDATE: Modify Existing Employee Information**
- Change Andrew Adams (EmployeeId = 1) title from 'General Manager' to 'CEO'
- Update Nancy Edwards (EmployeeId = 2) phone number to '+1 (403) 555-9999'
- Change all employees with Title = 'IT Staff' to have Title = 'IT Specialist'
## Requirements:
- Use UPDATE statements to modify the existing records
- The title update for 'IT Staff' should affect all matching employees
## Expected Results:
After completing the updates:
- Andrew Adams should have Title = 'CEO'
- Nancy Edwards should have Phone = '+1 (403) 555-9999'
- All employees previously with Title = 'IT Staff' should now have Title = 'IT Specialist'
This task practices UPDATE operations on individual employee records and on groups of records matched by a condition.
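A minimal sketch of the three updates (one possible phrasing; any equivalent UPDATE statements are acceptable):

```sql
UPDATE "Employee" SET "Title" = 'CEO'               WHERE "EmployeeId" = 1;
UPDATE "Employee" SET "Phone" = '+1 (403) 555-9999' WHERE "EmployeeId" = 2;
UPDATE "Employee" SET "Title" = 'IT Specialist'     WHERE "Title" = 'IT Staff';
```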
================================================
FILE: tasks/postgres/easy/chinook/update_employee_info/meta.json
================================================
{
"task_id": "update_employee_info",
"task_name": "Update Employee Info",
"category_id": "chinook",
"category_name": "Chinook",
"description": "Update Chinook employee records so Andrew Adams becomes CEO, Nancy Edwards receives the new phone number, and every \"IT Staff\" title becomes \"IT Specialist.\"",
"author": "Lingxiao Du",
"created_at": "2025-11-15",
"difficulty": "L1",
"tags": [
"data updates",
"organizational change"
],
"mcp": [
"postgres"
],
"meta_data": {
"stateType": "text",
"stateContent": "Table \"Album\" {\n \"AlbumId\" int4 [pk, not null]\n \"Title\" varchar(160) [not null]\n \"ArtistId\" int4 [not null]\n\n Indexes {\n ArtistId [type: btree, name: \"IFK_AlbumArtistId\"]\n }\n}\n\nTable \"Artist\" {\n \"ArtistId\" int4 [pk, not null]\n \"Name\" varchar(120)\n}\n\nTable \"Customer\" {\n \"CustomerId\" int4 [pk, not null]\n \"FirstName\" varchar(40) [not null]\n \"LastName\" varchar(20) [not null]\n \"Company\" varchar(80)\n \"Address\" varchar(70)\n \"City\" varchar(40)\n \"State\" varchar(40)\n \"Country\" varchar(40)\n \"PostalCode\" varchar(10)\n \"Phone\" varchar(24)\n \"Fax\" varchar(24)\n \"Email\" varchar(60) [not null]\n \"SupportRepId\" int4\n\n Indexes {\n SupportRepId [type: btree, name: \"IFK_CustomerSupportRepId\"]\n }\n}\n\nTable \"Employee\" {\n \"EmployeeId\" int4 [pk, not null]\n \"LastName\" varchar(20) [not null]\n \"FirstName\" varchar(20) [not null]\n \"Title\" varchar(30)\n \"ReportsTo\" int4\n \"BirthDate\" timestamp\n \"HireDate\" timestamp\n \"Address\" varchar(70)\n \"City\" varchar(40)\n \"State\" varchar(40)\n \"Country\" varchar(40)\n \"PostalCode\" varchar(10)\n \"Phone\" varchar(24)\n \"Fax\" varchar(24)\n \"Email\" varchar(60)\n\n Indexes {\n ReportsTo [type: btree, name: \"IFK_EmployeeReportsTo\"]\n }\n}\n\nTable \"Genre\" {\n \"GenreId\" int4 [pk, not null]\n \"Name\" varchar(120)\n}\n\nTable \"Invoice\" {\n \"InvoiceId\" int4 [pk, not null]\n \"CustomerId\" int4 [not null]\n \"InvoiceDate\" timestamp [not null]\n \"BillingAddress\" varchar(70)\n \"BillingCity\" varchar(40)\n \"BillingState\" varchar(40)\n \"BillingCountry\" varchar(40)\n \"BillingPostalCode\" varchar(10)\n \"Total\" numeric(10,2) [not null]\n\n Indexes {\n CustomerId [type: btree, name: \"IFK_InvoiceCustomerId\"]\n }\n}\n\nTable \"InvoiceLine\" {\n \"InvoiceLineId\" int4 [pk, not null]\n \"InvoiceId\" int4 [not null]\n \"TrackId\" int4 [not null]\n \"UnitPrice\" numeric(10,2) [not null]\n \"Quantity\" int4 [not null]\n\n Indexes {\n InvoiceId [type: btree, name: \"IFK_InvoiceLineInvoiceId\"]\n TrackId [type: btree, name: \"IFK_InvoiceLineTrackId\"]\n }\n}\n\nTable \"MediaType\" {\n \"MediaTypeId\" int4 [pk, not null]\n \"Name\" varchar(120)\n}\n\nTable \"Playlist\" {\n \"PlaylistId\" int4 [pk, not null]\n \"Name\" varchar(120)\n}\n\nTable \"PlaylistTrack\" {\n \"PlaylistId\" int4 [not null]\n \"TrackId\" int4 [not null]\n\n Indexes {\n (PlaylistId, TrackId) [type: btree, name: \"PK_PlaylistTrack\"]\n TrackId [type: btree, name: \"IFK_PlaylistTrackTrackId\"]\n }\n}\n\nTable \"Track\" {\n \"TrackId\" int4 [pk, not null]\n \"Name\" varchar(200) [not null]\n \"AlbumId\" int4\n \"MediaTypeId\" int4 [not null]\n \"GenreId\" int4\n \"Composer\" varchar(220)\n \"Milliseconds\" int4 [not null]\n \"Bytes\" int4\n \"UnitPrice\" numeric(10,2) [not null]\n\n Indexes {\n AlbumId [type: btree, name: \"IFK_TrackAlbumId\"]\n GenreId [type: btree, name: \"IFK_TrackGenreId\"]\n MediaTypeId [type: btree, name: \"IFK_TrackMediaTypeId\"]\n }\n}\n\nRef \"FK_AlbumArtistId\":\"Artist\".\"ArtistId\" < \"Album\".\"ArtistId\"\n\nRef \"FK_CustomerSupportRepId\":\"Employee\".\"EmployeeId\" < \"Customer\".\"SupportRepId\"\n\nRef \"FK_EmployeeReportsTo\":\"Employee\".\"EmployeeId\" < \"Employee\".\"ReportsTo\"\n\nRef \"FK_InvoiceCustomerId\":\"Customer\".\"CustomerId\" < \"Invoice\".\"CustomerId\"\n\nRef \"FK_InvoiceLineInvoiceId\":\"Invoice\".\"InvoiceId\" < \"InvoiceLine\".\"InvoiceId\"\n\nRef \"FK_InvoiceLineTrackId\":\"Track\".\"TrackId\" < \"InvoiceLine\".\"TrackId\"\n\nRef 
\"FK_PlaylistTrackPlaylistId\":\"Playlist\".\"PlaylistId\" < \"PlaylistTrack\".\"PlaylistId\"\n\nRef \"FK_PlaylistTrackTrackId\":\"Track\".\"TrackId\" < \"PlaylistTrack\".\"TrackId\"\n\nRef \"FK_TrackAlbumId\":\"Album\".\"AlbumId\" < \"Track\".\"AlbumId\"\n\nRef \"FK_TrackGenreId\":\"Genre\".\"GenreId\" < \"Track\".\"GenreId\"\n\nRef \"FK_TrackMediaTypeId\":\"MediaType\".\"MediaTypeId\" < \"Track\".\"MediaTypeId\"\n",
"stateUrl": null,
"stateOriginalUrl": "https://github.com/neondatabase-labs/postgres-sample-dbs/blob/main/chinook.sql"
}
}
================================================
FILE: tasks/postgres/easy/chinook/update_employee_info/verify.py
================================================
"""
Verification script for PostgreSQL Task 3: Employee Hierarchy Management
"""
import os
import sys
import psycopg2
from decimal import Decimal
def rows_match(actual_row, expected_row):
"""
Compare two rows with appropriate tolerance.
For Decimal types: allows 0.01 tolerance
For other types: requires exact match
"""
if len(actual_row) != len(expected_row):
return False
for actual, expected in zip(actual_row, expected_row):
if isinstance(actual, Decimal) and isinstance(expected, Decimal):
if abs(float(actual) - float(expected)) > 0.01:
return False
elif actual != expected:
return False
return True
def get_connection_params() -> dict:
"""Get database connection parameters."""
return {
"host": os.getenv("POSTGRES_HOST", "localhost"),
"port": int(os.getenv("POSTGRES_PORT", 5432)),
"database": os.getenv("POSTGRES_DATABASE"),
"user": os.getenv("POSTGRES_USERNAME"),
"password": os.getenv("POSTGRES_PASSWORD")
}
def verify_employee_count_and_titles(conn) -> bool:
"""Verify the final employee count and title changes."""
with conn.cursor() as cur:
# Check the final verification query results
cur.execute("""
SELECT
COUNT(*) as total_employees,
COUNT(CASE WHEN "Title" = 'CEO' THEN 1 END) as ceo_count,
COUNT(CASE WHEN "Title" = 'IT Specialist' THEN 1 END) as it_specialist_count
FROM "Employee"
""")
result = cur.fetchone()
total_employees, ceo_count, it_specialist_count = result
if total_employees != 8:
print(f"❌ Expected 8 total employees, got {total_employees}")
return False
if ceo_count != 1:
print(f"❌ Expected 1 CEO, got {ceo_count}")
return False
if it_specialist_count != 2:
print(f"❌ Expected 2 IT Specialists, got {it_specialist_count}")
return False
print("✅ Employee count and title verification passed")
return True
def verify_specific_employees(conn) -> bool:
"""Verify specific employee records and modifications."""
with conn.cursor() as cur:
# Check all employee fields in one query
cur.execute("""
SELECT "EmployeeId", "LastName", "FirstName", "Title", "ReportsTo", "BirthDate",
"HireDate", "Address", "City", "State", "Country", "PostalCode",
"Phone", "Fax", "Email"
FROM "Employee"
WHERE "EmployeeId" IN (1, 2)
ORDER BY "EmployeeId"
""")
employees = cur.fetchall()
from datetime import datetime
expected = [
# Andrew Adams (ID 1) - Title changes to 'CEO', phone stays original, ReportsTo stays None
(1, 'Adams', 'Andrew', 'CEO', None, datetime(1962, 2, 18), datetime(2002, 8, 14),
'11120 Jasper Ave NW', 'Edmonton', 'AB', 'Canada', 'T5K 2N1', '+1 (780) 428-9482', '+1 (780) 428-3457', 'andrew@chinookcorp.com'),
# Nancy Edwards (ID 2) - Phone changes, title stays 'Sales Manager', ReportsTo stays 1
(2, 'Edwards', 'Nancy', 'Sales Manager', 1, datetime(1958, 12, 8), datetime(2002, 5, 1),
'825 8 Ave SW', 'Calgary', 'AB', 'Canada', 'T2P 2T3', '+1 (403) 555-9999', '+1 (403) 262-3322', 'nancy@chinookcorp.com'),
]
if len(employees) != 2:
print(f"❌ Expected 2 key employees, found {len(employees)}")
return False
# Full field comparison for all employees using rows_match
for actual, expected_emp in zip(employees, expected):
if not rows_match(actual, expected_emp):
print(f"❌ Employee {actual[0]} row mismatch: expected {expected_emp}, got {actual}")
return False
print("✅ Specific employee verification passed - all fields match exactly")
return True
def main():
"""Main verification function."""
print("=" * 50)
print("Verifying Task 3: Employee Hierarchy Management")
print("=" * 50)
# Get connection parameters
conn_params = get_connection_params()
if not conn_params["database"]:
print("❌ No database specified")
sys.exit(1)
try:
# Connect to database
conn = psycopg2.connect(**conn_params)
# Run verification checks with short-circuit evaluation
success = (
verify_employee_count_and_titles(conn) and
verify_specific_employees(conn)
)
conn.close()
if success:
print("\n🎉 Task verification: PASS")
print("All employee hierarchy management operations completed correctly!")
sys.exit(0)
else:
print("\n❌ Task verification: FAIL")
sys.exit(1)
except psycopg2.Error as e:
print(f"❌ Database error: {e}")
sys.exit(1)
except Exception as e:
print(f"❌ Verification error: {e}")
sys.exit(1)
if __name__ == "__main__":
main()
================================================
FILE: tasks/postgres/easy/dvdrental/create_payment_index/description.md
================================================
Create an index to optimize customer payment queries in the DVD rental database.
## Your Task:
Create an index on the `customer_id` column of the `payment` table to improve query performance.
## Requirements:
- Create an index on the `payment` table's `customer_id` column
- The index name can be anything you choose (e.g., `idx_payment_customer_id`)
- Use the standard CREATE INDEX syntax
## Why This Helps:
The `customer_id` column is frequently used in:
- JOIN operations between customer and payment tables
- WHERE clauses filtering by customer
- Subqueries that look up payments for specific customers
Adding an index will significantly speed up these operations.
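For example (any index name works; `idx_payment_customer_id` is simply the name suggested above):

```sql
CREATE INDEX idx_payment_customer_id ON payment (customer_id);
```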
================================================
FILE: tasks/postgres/easy/dvdrental/create_payment_index/meta.json
================================================
{
"task_id": "create_payment_index",
"task_name": "Create Payment Index",
"category_id": "dvdrental",
"category_name": "DVD Rental",
"description": "Add an index on payment.customer_id to speed up the customer payment lookups in the DVD Rental database.",
"author": "Lingxiao Du",
"created_at": "2025-11-15",
"difficulty": "L1",
"tags": [
"performance optimization",
"indexing"
],
"mcp": [
"postgres"
],
"meta_data": {
"stateType": "text",
"stateContent": "Enum \"mpaa_rating\" {\n \"G\"\n \"PG\"\n \"PG-13\"\n \"R\"\n \"NC-17\"\n}\n\nTable \"customer\" {\n \"customer_id\" int4 [pk, not null, increment]\n \"store_id\" int2 [not null]\n \"first_name\" varchar(45) [not null]\n \"last_name\" varchar(45) [not null]\n \"email\" varchar(50)\n \"address_id\" int2 [not null]\n \"activebool\" bool [not null, default: true]\n \"create_date\" date [not null, default: `('now'::text)::date`]\n \"last_update\" timestamp [default: `now()`]\n \"active\" int4\n\n Indexes {\n address_id [type: btree, name: \"idx_fk_address_id\"]\n store_id [type: btree, name: \"idx_fk_store_id\"]\n last_name [type: btree, name: \"idx_last_name\"]\n }\n}\n\nTable \"actor\" {\n \"actor_id\" int4 [pk, not null, increment]\n \"first_name\" varchar(45) [not null]\n \"last_name\" varchar(45) [not null]\n \"last_update\" timestamp [not null, default: `now()`]\n\n Indexes {\n last_name [type: btree, name: \"idx_actor_last_name\"]\n }\n}\n\nTable \"category\" {\n \"category_id\" int4 [pk, not null, increment]\n \"name\" varchar(25) [not null]\n \"last_update\" timestamp [not null, default: `now()`]\n}\n\nTable \"film\" {\n \"film_id\" int4 [pk, not null, increment]\n \"title\" varchar(255) [not null]\n \"description\" text\n \"release_year\" int4\n \"language_id\" int2 [not null]\n \"rental_duration\" int2 [not null, default: 3]\n \"rental_rate\" numeric(4,2) [not null, default: 4.99]\n \"length\" int2\n \"replacement_cost\" numeric(5,2) [not null, default: 19.99]\n \"rating\" mpaa_rating [default: 'G']\n \"last_update\" timestamp [not null, default: `now()`]\n \"special_features\" \"text[]\"\n \"fulltext\" tsvector [not null]\n\n Indexes {\n fulltext [type: gist, name: \"film_fulltext_idx\"]\n language_id [type: btree, name: \"idx_fk_language_id\"]\n title [type: btree, name: \"idx_title\"]\n }\n}\n\nTable \"film_actor\" {\n \"actor_id\" int2 [not null]\n \"film_id\" int2 [not null]\n \"last_update\" timestamp [not null, default: `now()`]\n\n Indexes {\n (actor_id, film_id) [type: btree, name: \"film_actor_pkey\"]\n film_id [type: btree, name: \"idx_fk_film_id\"]\n }\n}\n\nTable \"film_category\" {\n \"film_id\" int2 [not null]\n \"category_id\" int2 [not null]\n \"last_update\" timestamp [not null, default: `now()`]\n\n Indexes {\n (film_id, category_id) [type: btree, name: \"film_category_pkey\"]\n }\n}\n\nTable \"address\" {\n \"address_id\" int4 [pk, not null, increment]\n \"address\" varchar(50) [not null]\n \"address2\" varchar(50)\n \"district\" varchar(20) [not null]\n \"city_id\" int2 [not null]\n \"postal_code\" varchar(10)\n \"phone\" varchar(20) [not null]\n \"last_update\" timestamp [not null, default: `now()`]\n\n Indexes {\n city_id [type: btree, name: \"idx_fk_city_id\"]\n }\n}\n\nTable \"city\" {\n \"city_id\" int4 [pk, not null, increment]\n \"city\" varchar(50) [not null]\n \"country_id\" int2 [not null]\n \"last_update\" timestamp [not null, default: `now()`]\n\n Indexes {\n country_id [type: btree, name: \"idx_fk_country_id\"]\n }\n}\n\nTable \"country\" {\n \"country_id\" int4 [pk, not null, increment]\n \"country\" varchar(50) [not null]\n \"last_update\" timestamp [not null, default: `now()`]\n}\n\nTable \"inventory\" {\n \"inventory_id\" int4 [pk, not null, increment]\n \"film_id\" int2 [not null]\n \"store_id\" int2 [not null]\n \"last_update\" timestamp [not null, default: `now()`]\n\n Indexes {\n (store_id, film_id) [type: btree, name: \"idx_store_id_film_id\"]\n }\n}\n\nTable \"language\" {\n \"language_id\" int4 [pk, not null, increment]\n 
\"name\" bpchar(20) [not null]\n \"last_update\" timestamp [not null, default: `now()`]\n}\n\nTable \"payment\" {\n \"payment_id\" int4 [pk, not null, increment]\n \"customer_id\" int2 [not null]\n \"staff_id\" int2 [not null]\n \"rental_id\" int4 [not null]\n \"amount\" numeric(5,2) [not null]\n \"payment_date\" timestamp [not null]\n\n Indexes {\n rental_id [type: btree, name: \"idx_fk_rental_id\"]\n staff_id [type: btree, name: \"idx_fk_staff_id\"]\n }\n}\n\nTable \"rental\" {\n \"rental_id\" int4 [pk, not null, increment]\n \"rental_date\" timestamp [not null]\n \"inventory_id\" int4 [not null]\n \"customer_id\" int2 [not null]\n \"return_date\" timestamp\n \"staff_id\" int2 [not null]\n \"last_update\" timestamp [not null, default: `now()`]\n\n Indexes {\n (rental_date, inventory_id, customer_id) [type: btree, name: \"idx_unq_rental_rental_date_inventory_id_customer_id\"]\n inventory_id [type: btree, name: \"idx_fk_inventory_id\"]\n }\n}\n\nTable \"staff\" {\n \"staff_id\" int4 [pk, not null, increment]\n \"first_name\" varchar(45) [not null]\n \"last_name\" varchar(45) [not null]\n \"address_id\" int2 [not null]\n \"email\" varchar(50)\n \"store_id\" int2 [not null]\n \"active\" bool [not null, default: true]\n \"username\" varchar(16) [not null]\n \"password\" varchar(40)\n \"last_update\" timestamp [not null, default: `now()`]\n \"picture\" bytea\n}\n\nTable \"store\" {\n \"store_id\" int4 [pk, not null, increment]\n \"manager_staff_id\" int2 [unique, not null]\n \"address_id\" int2 [not null]\n \"last_update\" timestamp [not null, default: `now()`]\n}\n\nRef \"fk_address_city\":\"city\".\"city_id\" < \"address\".\"city_id\"\n\nRef \"fk_city\":\"country\".\"country_id\" < \"city\".\"country_id\"\n\nRef \"customer_address_id_fkey\":\"address\".\"address_id\" < \"customer\".\"address_id\" [update: cascade, delete: restrict]\n\nRef \"film_language_id_fkey\":\"language\".\"language_id\" < \"film\".\"language_id\" [update: cascade, delete: restrict]\n\nRef \"film_actor_actor_id_fkey\":\"actor\".\"actor_id\" < \"film_actor\".\"actor_id\" [update: cascade, delete: restrict]\n\nRef \"film_actor_film_id_fkey\":\"film\".\"film_id\" < \"film_actor\".\"film_id\" [update: cascade, delete: restrict]\n\nRef \"film_category_category_id_fkey\":\"category\".\"category_id\" < \"film_category\".\"category_id\" [update: cascade, delete: restrict]\n\nRef \"film_category_film_id_fkey\":\"film\".\"film_id\" < \"film_category\".\"film_id\" [update: cascade, delete: restrict]\n\nRef \"inventory_film_id_fkey\":\"film\".\"film_id\" < \"inventory\".\"film_id\" [update: cascade, delete: restrict]\n\nRef \"payment_customer_id_fkey\":\"customer\".\"customer_id\" < \"payment\".\"customer_id\" [update: cascade, delete: restrict]\n\nRef \"payment_rental_id_fkey\":\"rental\".\"rental_id\" < \"payment\".\"rental_id\" [update: cascade, delete: set null]\n\nRef \"payment_staff_id_fkey\":\"staff\".\"staff_id\" < \"payment\".\"staff_id\" [update: cascade, delete: restrict]\n\nRef \"rental_customer_id_fkey\":\"customer\".\"customer_id\" < \"rental\".\"customer_id\" [update: cascade, delete: restrict]\n\nRef \"rental_inventory_id_fkey\":\"inventory\".\"inventory_id\" < \"rental\".\"inventory_id\" [update: cascade, delete: restrict]\n\nRef \"rental_staff_id_key\":\"staff\".\"staff_id\" < \"rental\".\"staff_id\"\n\nRef \"staff_address_id_fkey\":\"address\".\"address_id\" < \"staff\".\"address_id\" [update: cascade, delete: restrict]\n\nRef \"store_address_id_fkey\":\"address\".\"address_id\" < \"store\".\"address_id\" 
[update: cascade, delete: restrict]\n\nRef \"store_manager_staff_id_fkey\":\"staff\".\"staff_id\" < \"store\".\"manager_staff_id\" [update: cascade, delete: restrict]\n",
"stateUrl": null,
"stateOriginalUrl": "https://github.com/gordonkwokkwok/DVD-Rental-PostgreSQL-Project"
}
}
================================================
FILE: tasks/postgres/easy/dvdrental/create_payment_index/verify.py
================================================
"""
Verification script for PostgreSQL Task 1: Customer Payment Query Optimization
"""
import os
import sys
import psycopg2
def get_connection_params() -> dict:
"""Get database connection parameters."""
return {
"host": os.getenv("POSTGRES_HOST", "localhost"),
"port": int(os.getenv("POSTGRES_PORT", 5432)),
"database": os.getenv("POSTGRES_DATABASE"),
"user": os.getenv("POSTGRES_USERNAME"),
"password": os.getenv("POSTGRES_PASSWORD")
}
def check_payment_customer_id_index(conn) -> tuple:
    """Return (has_index, index_rows) for any index on payment.customer_id."""
with conn.cursor() as cur:
cur.execute("""
SELECT indexname, indexdef
FROM pg_indexes
WHERE schemaname = 'public'
AND tablename = 'payment'
AND indexdef LIKE '%customer_id%'
""")
indexes = cur.fetchall()
return len(indexes) > 0, indexes
def main():
"""Main verification function."""
print("=" * 60)
print("PostgreSQL Task 1 Verification: Customer Payment Query Optimization")
print("=" * 60)
# Get connection parameters
conn_params = get_connection_params()
if not conn_params["database"]:
print("❌ No database specified")
sys.exit(1)
try:
# Connect to database
conn = psycopg2.connect(**conn_params)
print("\n🔍 Checking for customer_id index on payment table...")
# Check if any index exists on payment.customer_id
has_index, indexes = check_payment_customer_id_index(conn)
if has_index:
print("✅ Found index(es) on payment.customer_id:")
for index_name, index_def in indexes:
print(f" - {index_name}: {index_def}")
else:
print("❌ No index found on payment.customer_id column")
conn.close()
if has_index:
print(f"\n🎉 Task verification: PASS")
print(f" - Index on payment.customer_id exists")
sys.exit(0)
else:
print(f"\n❌ Task verification: FAIL")
print(f" - No index found on payment.customer_id")
print(f" - Create an index on payment(customer_id) to optimize the queries")
sys.exit(1)
except psycopg2.Error as e:
print(f"❌ Database error: {e}")
sys.exit(1)
except Exception as e:
print(f"❌ Verification error: {e}")
sys.exit(1)
if __name__ == "__main__":
main()
================================================
FILE: tasks/postgres/easy/employees/department_summary_view/description.md
================================================
Create an executive department summary view to provide quick insights into departmental metrics for leadership dashboards. This view will consolidate key department statistics in one easily accessible place.
## Your Task:
**Create the executive department summary view** — build a materialized view called `exec_department_summary` in the `employees` schema with these exact columns:
* `department_name` (varchar) — department name
* `total_employees` (integer) — current active employee count (employees with active salary where to_date = '9999-01-01')
* `avg_salary` (decimal) — average current salary for active employees
* `total_payroll` (bigint) — total monthly payroll cost (sum of all current salaries in the department)
* `manager_name` (varchar) — current department manager's full name (first_name and last_name concatenated)
## Requirements:
1. Use a materialized view to cache results for better performance
2. Join the following tables (all in the `employees` schema):
   - `employees.department` - for department information
   - `employees.department_employee` - for employee-department relationships
   - `employees.employee` - for employee details
   - `employees.salary` - for current salary information
   - `employees.department_manager` - for current manager information
3. Only include current active employees (those with to_date = '9999-01-01' in both `department_employee` and `salary`)
4. Only include current managers (to_date = '9999-01-01' in `department_manager`)
5. Order results by department_name
## After Creation:
Refresh the materialized view to populate it with current data.
This view will provide executives with a real-time snapshot of departmental workforce metrics and costs.
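A sketch of one possible definition, assuming each active employee has a single active salary row and each department has one current manager:

```sql
CREATE MATERIALIZED VIEW employees.exec_department_summary AS
SELECT
    d.dept_name::varchar                          AS department_name,
    COUNT(DISTINCT de.employee_id)::integer       AS total_employees,
    AVG(s.amount)::decimal                        AS avg_salary,
    SUM(s.amount)::bigint                         AS total_payroll,
    (m.first_name || ' ' || m.last_name)::varchar AS manager_name
FROM employees.department d
JOIN employees.department_employee de
  ON de.department_id = d.id AND de.to_date = DATE '9999-01-01'
JOIN employees.salary s
  ON s.employee_id = de.employee_id AND s.to_date = DATE '9999-01-01'
JOIN employees.department_manager dm
  ON dm.department_id = d.id AND dm.to_date = DATE '9999-01-01'
JOIN employees.employee m
  ON m.id = dm.employee_id
GROUP BY d.dept_name, m.first_name, m.last_name
ORDER BY department_name;

REFRESH MATERIALIZED VIEW employees.exec_department_summary;
```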
================================================
FILE: tasks/postgres/easy/employees/department_summary_view/meta.json
================================================
{
"task_id": "department_summary_view",
"task_name": "Department Summary View",
"category_id": "employees",
"category_name": "Employees",
"description": "Build the exec_department_summary materialized view showing department name, active headcount, payroll totals, and the manager name.",
"author": "Lingxiao Du",
"created_at": "2025-11-15",
"difficulty": "L1",
"tags": [
"reporting and analytics",
"materialized views"
],
"mcp": [
"postgres"
],
"meta_data": {
"stateType": "text",
"stateContent": "Enum \"employees\".\"employee_gender\" {\n \"M\"\n \"F\"\n}\n\nTable \"employees\".\"department\" {\n \"id\" bpchar(4) [pk, not null]\n \"dept_name\" varchar(40) [unique, not null]\n}\n\nTable \"employees\".\"department_employee\" {\n \"employee_id\" int8 [not null]\n \"department_id\" bpchar(4) [not null]\n \"from_date\" date [not null]\n \"to_date\" date [not null]\n\n Indexes {\n (employee_id, department_id) [type: btree, name: \"idx_16982_primary\"]\n department_id [type: btree, name: \"idx_16982_dept_no\"]\n }\n}\n\nTable \"employees\".\"department_manager\" {\n \"employee_id\" int8 [not null]\n \"department_id\" bpchar(4) [not null]\n \"from_date\" date [not null]\n \"to_date\" date [not null]\n\n Indexes {\n (employee_id, department_id) [type: btree, name: \"idx_16985_primary\"]\n department_id [type: btree, name: \"idx_16985_dept_no\"]\n }\n}\n\nTable \"employees\".\"employee\" {\n \"id\" int8 [pk, not null, increment]\n \"birth_date\" date [not null]\n \"first_name\" varchar(14) [not null]\n \"last_name\" varchar(16) [not null]\n \"gender\" employees.employee_gender [not null]\n \"hire_date\" date [not null]\n}\n\nTable \"employees\".\"salary\" {\n \"employee_id\" int8 [not null]\n \"amount\" int8 [not null]\n \"from_date\" date [not null]\n \"to_date\" date [not null]\n\n Indexes {\n (employee_id, from_date) [type: btree, name: \"idx_16991_primary\"]\n }\n}\n\nTable \"employees\".\"title\" {\n \"employee_id\" int8 [not null]\n \"title\" varchar(50) [not null]\n \"from_date\" date [not null]\n \"to_date\" date\n\n Indexes {\n (employee_id, title, from_date) [type: btree, name: \"idx_16994_primary\"]\n }\n}\n\nRef \"dept_emp_ibfk_1\":\"employees\".\"employee\".\"id\" < \"employees\".\"department_employee\".\"employee_id\" [update: restrict, delete: cascade]\n\nRef \"dept_emp_ibfk_2\":\"employees\".\"department\".\"id\" < \"employees\".\"department_employee\".\"department_id\" [update: restrict, delete: cascade]\n\nRef \"dept_manager_ibfk_1\":\"employees\".\"employee\".\"id\" < \"employees\".\"department_manager\".\"employee_id\" [update: restrict, delete: cascade]\n\nRef \"dept_manager_ibfk_2\":\"employees\".\"department\".\"id\" < \"employees\".\"department_manager\".\"department_id\" [update: restrict, delete: cascade]\n\nRef \"salaries_ibfk_1\":\"employees\".\"employee\".\"id\" < \"employees\".\"salary\".\"employee_id\" [update: restrict, delete: cascade]\n\nRef \"titles_ibfk_1\":\"employees\".\"employee\".\"id\" < \"employees\".\"title\".\"employee_id\" [update: restrict, delete: cascade]\n",
"stateUrl": null,
"stateOriginalUrl": "https://github.com/neondatabase-labs/postgres-sample-dbs/blob/main/employees.sql.gz"
}
}
================================================
FILE: tasks/postgres/easy/employees/department_summary_view/verify.py
================================================
"""
Verification script for PostgreSQL Task 6: Reporting and Automation System
"""
import os
import sys
import psycopg2
from decimal import Decimal
def rows_match(actual_row, expected_row):
"""
Compare two rows with appropriate tolerance.
For Decimal types: allows 0.1 tolerance
For date types: convert to string for comparison
For other types: requires exact match
"""
if len(actual_row) != len(expected_row):
return False
for actual, expected in zip(actual_row, expected_row):
if isinstance(actual, Decimal) and isinstance(expected, (Decimal, float, int)):
if abs(float(actual) - float(expected)) > 0.1:
return False
elif hasattr(actual, 'strftime'): # datetime.date or datetime.datetime
if str(actual) != str(expected):
return False
elif actual != expected:
return False
return True
def get_connection_params() -> dict:
"""Get database connection parameters."""
return {
"host": os.getenv("POSTGRES_HOST", "localhost"),
"port": int(os.getenv("POSTGRES_PORT", 5432)),
"database": os.getenv("POSTGRES_DATABASE"),
"user": os.getenv("POSTGRES_USERNAME"),
"password": os.getenv("POSTGRES_PASSWORD")
}
def verify_materialized_views(conn) -> bool:
"""Verify that materialized views were created and populated correctly."""
with conn.cursor() as cur:
# Check all departments' data accuracy
cur.execute("""
SELECT department_name, total_employees, avg_salary, total_payroll, manager_name
FROM employees.exec_department_summary
ORDER BY department_name
""")
view_data = cur.fetchall()
# Get actual data for all departments
cur.execute("""
WITH current_salary AS (
SELECT employee_id, amount
FROM (
SELECT s.*,
ROW_NUMBER() OVER (
PARTITION BY s.employee_id
ORDER BY s.from_date DESC, s.amount DESC
) AS rn
FROM employees.salary s
WHERE s.to_date = DATE '9999-01-01'
) x
WHERE rn = 1
),
current_dept AS (
SELECT DISTINCT de.employee_id, de.department_id
FROM employees.department_employee de
WHERE de.to_date = DATE '9999-01-01'
),
current_manager AS (
SELECT department_id,
CONCAT(e.first_name, ' ', e.last_name) AS manager_name
FROM (
SELECT dm.*,
ROW_NUMBER() OVER (
PARTITION BY dm.department_id
ORDER BY dm.from_date DESC, dm.employee_id
) AS rn
FROM employees.department_manager dm
WHERE dm.to_date = DATE '9999-01-01'
) dm
JOIN employees.employee e ON e.id = dm.employee_id
WHERE dm.rn = 1
)
SELECT
d.dept_name AS department_name,
COUNT(cd.employee_id)::INT AS total_employees,
AVG(cs.amount)::DECIMAL AS avg_salary,
COALESCE(SUM(cs.amount), 0)::BIGINT AS total_payroll,
cm.manager_name
FROM employees.department d
LEFT JOIN current_dept cd ON cd.department_id = d.id
LEFT JOIN current_salary cs ON cs.employee_id = cd.employee_id
LEFT JOIN current_manager cm ON cm.department_id = d.id
GROUP BY d.id, d.dept_name, cm.manager_name
ORDER BY d.dept_name;
""")
actual_data = cur.fetchall()
if len(view_data) != len(actual_data):
print(f"❌ Department count mismatch: view={len(view_data)}, actual={len(actual_data)}")
return False
for view_row, actual_row in zip(view_data, actual_data):
if not rows_match(view_row, actual_row):
print(f"❌ Department summary data incorrect for {view_row[0]}: view={view_row}, actual={actual_row}")
return False
return True
def main():
"""Main verification function."""
print("=" * 50)
# Get connection parameters
conn_params = get_connection_params()
if not conn_params["database"]:
print("❌ No database specified")
sys.exit(1)
try:
# Connect to database
conn = psycopg2.connect(**conn_params)
# Verify all components
success = verify_materialized_views(conn)
conn.close()
if success:
print("\n🎉 Task verification: PASS")
sys.exit(0)
else:
print("\n❌ Task verification: FAIL")
sys.exit(1)
except psycopg2.Error as e:
print(f"❌ Database error: {e}")
sys.exit(1)
except Exception as e:
print(f"❌ Verification error: {e}")
sys.exit(1)
if __name__ == "__main__":
main()
================================================
FILE: tasks/postgres/easy/employees/employee_gender_statistics/description.md
================================================
Create a gender statistics summary table for the HR team's annual workforce composition report. This is a simple analysis to understand the gender distribution in our employee database.
## Your Task:
**Create the gender statistics table** — build a table called `gender_statistics` in the `employees` schema with these exact columns:
* `gender` (varchar) — gender ('M' or 'F')
* `total_employees` (integer) — total number of employees of this gender
* `current_employees` (integer) — current employees of this gender (have active salary where to_date = '9999-01-01')
* `percentage_of_workforce` (decimal) — percentage of current workforce (current_employees / total current employees * 100)
## Requirements:
1. Calculate total employees by counting all employees of each gender from the `employees.employee` table
2. Calculate current employees by counting employees with active salary records (to_date = '9999-01-01' in the `employees.salary` table)
3. Calculate the percentage based on current workforce only
4. The table should contain exactly 2 rows (one for 'M' and one for 'F')
This analysis will help HR understand the basic gender composition of our workforce for diversity reporting.
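One possible shape for building the table (a sketch; any query that produces the two required rows is fine):

```sql
-- Sketch: total vs. currently employed counts per gender, with the share of
-- the current workforce computed over the grouped counts.
CREATE TABLE employees.gender_statistics AS
WITH current_emp AS (
    SELECT DISTINCT employee_id
    FROM employees.salary
    WHERE to_date = DATE '9999-01-01'
)
SELECT
    e.gender::varchar              AS gender,
    COUNT(*)::integer              AS total_employees,
    COUNT(ce.employee_id)::integer AS current_employees,
    (COUNT(ce.employee_id)::decimal
       / SUM(COUNT(ce.employee_id)) OVER () * 100) AS percentage_of_workforce
FROM employees.employee e
LEFT JOIN current_emp ce ON ce.employee_id = e.id
GROUP BY e.gender;
```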
================================================
FILE: tasks/postgres/easy/employees/employee_gender_statistics/meta.json
================================================
{
"task_id": "employee_gender_statistics",
"task_name": "Employee Gender Statistics",
"category_id": "employees",
"category_name": "Employees",
"description": "Aggregate the employees dataset into a gender_statistics table with counts of total/current staff by gender plus workforce percentage.",
"author": "Lingxiao Du",
"created_at": "2025-11-15",
"difficulty": "L1",
"tags": [
"reporting and analytics",
"data aggregation"
],
"mcp": [
"postgres"
],
"meta_data": {
"stateType": "text",
"stateContent": "Enum \"employees\".\"employee_gender\" {\n \"M\"\n \"F\"\n}\n\nTable \"employees\".\"department\" {\n \"id\" bpchar(4) [pk, not null]\n \"dept_name\" varchar(40) [unique, not null]\n}\n\nTable \"employees\".\"department_employee\" {\n \"employee_id\" int8 [not null]\n \"department_id\" bpchar(4) [not null]\n \"from_date\" date [not null]\n \"to_date\" date [not null]\n\n Indexes {\n (employee_id, department_id) [type: btree, name: \"idx_16982_primary\"]\n department_id [type: btree, name: \"idx_16982_dept_no\"]\n }\n}\n\nTable \"employees\".\"department_manager\" {\n \"employee_id\" int8 [not null]\n \"department_id\" bpchar(4) [not null]\n \"from_date\" date [not null]\n \"to_date\" date [not null]\n\n Indexes {\n (employee_id, department_id) [type: btree, name: \"idx_16985_primary\"]\n department_id [type: btree, name: \"idx_16985_dept_no\"]\n }\n}\n\nTable \"employees\".\"employee\" {\n \"id\" int8 [pk, not null, increment]\n \"birth_date\" date [not null]\n \"first_name\" varchar(14) [not null]\n \"last_name\" varchar(16) [not null]\n \"gender\" employees.employee_gender [not null]\n \"hire_date\" date [not null]\n}\n\nTable \"employees\".\"salary\" {\n \"employee_id\" int8 [not null]\n \"amount\" int8 [not null]\n \"from_date\" date [not null]\n \"to_date\" date [not null]\n\n Indexes {\n (employee_id, from_date) [type: btree, name: \"idx_16991_primary\"]\n }\n}\n\nTable \"employees\".\"title\" {\n \"employee_id\" int8 [not null]\n \"title\" varchar(50) [not null]\n \"from_date\" date [not null]\n \"to_date\" date\n\n Indexes {\n (employee_id, title, from_date) [type: btree, name: \"idx_16994_primary\"]\n }\n}\n\nRef \"dept_emp_ibfk_1\":\"employees\".\"employee\".\"id\" < \"employees\".\"department_employee\".\"employee_id\" [update: restrict, delete: cascade]\n\nRef \"dept_emp_ibfk_2\":\"employees\".\"department\".\"id\" < \"employees\".\"department_employee\".\"department_id\" [update: restrict, delete: cascade]\n\nRef \"dept_manager_ibfk_1\":\"employees\".\"employee\".\"id\" < \"employees\".\"department_manager\".\"employee_id\" [update: restrict, delete: cascade]\n\nRef \"dept_manager_ibfk_2\":\"employees\".\"department\".\"id\" < \"employees\".\"department_manager\".\"department_id\" [update: restrict, delete: cascade]\n\nRef \"salaries_ibfk_1\":\"employees\".\"employee\".\"id\" < \"employees\".\"salary\".\"employee_id\" [update: restrict, delete: cascade]\n\nRef \"titles_ibfk_1\":\"employees\".\"employee\".\"id\" < \"employees\".\"title\".\"employee_id\" [update: restrict, delete: cascade]\n",
"stateUrl": null,
"stateOriginalUrl": "https://github.com/neondatabase-labs/postgres-sample-dbs/blob/main/employees.sql.gz"
}
}
================================================
FILE: tasks/postgres/easy/employees/employee_gender_statistics/verify.py
================================================
import os
import sys
import psycopg2
from decimal import Decimal
def rows_match(actual_row, expected_row):
"""
Compare two rows with appropriate tolerance.
For Decimal types: allows 0.1 tolerance
For other types: requires exact match
"""
if len(actual_row) != len(expected_row):
return False
for actual, expected in zip(actual_row, expected_row):
if isinstance(actual, Decimal) and isinstance(expected, Decimal):
if abs(float(actual) - float(expected)) > 0.1:
return False
elif actual != expected:
return False
return True
def get_connection_params() -> dict:
"""Get database connection parameters."""
return {
"host": os.getenv("POSTGRES_HOST", "localhost"),
"port": int(os.getenv("POSTGRES_PORT", 5432)),
"database": os.getenv("POSTGRES_DATABASE"),
"user": os.getenv("POSTGRES_USERNAME"),
"password": os.getenv("POSTGRES_PASSWORD")
}
def verify_gender_statistics_results(conn) -> bool:
"""Verify the gender statistics results."""
with conn.cursor() as cur:
# Get actual results from the created table
cur.execute("""
SELECT gender, total_employees, current_employees, percentage_of_workforce
FROM employees.gender_statistics
ORDER BY gender
""")
actual_results = cur.fetchall()
# Execute ground truth query
cur.execute("""
WITH current_emp AS (
SELECT DISTINCT s.employee_id
FROM employees.salary s
WHERE s.to_date = DATE '9999-01-01'
),
total_current AS (
SELECT COUNT(*) AS cnt
FROM current_emp
)
SELECT
e.gender::varchar AS gender,
COUNT(*) AS total_employees,
COUNT(*) FILTER (WHERE ce.employee_id IS NOT NULL) AS current_employees,
(COUNT(*) FILTER (WHERE ce.employee_id IS NOT NULL))::DECIMAL
/ NULLIF((SELECT cnt FROM total_current), 0) * 100 AS percentage_of_workforce
FROM employees.employee e
LEFT JOIN current_emp ce ON ce.employee_id = e.id
WHERE e.gender IN ('M','F')
GROUP BY e.gender
ORDER BY gender;
""")
expected_results = cur.fetchall()
if len(actual_results) != len(expected_results):
print(f"❌ Expected {len(expected_results)} gender statistics results, got {len(actual_results)}")
return False
mismatches = 0
for i, (actual, expected) in enumerate(zip(actual_results, expected_results)):
if not rows_match(actual, expected):
if mismatches < 5: # Only show first 5 mismatches
print(f"❌ Row {i+1} mismatch: expected {expected}, got {actual}")
mismatches += 1
if mismatches > 0:
print(f"❌ Total mismatches: {mismatches}")
return False
print(f"✅ Gender statistics results are correct ({len(actual_results)} records)")
return True
def main():
"""Main verification function."""
print("=" * 50)
# Get connection parameters
conn_params = get_connection_params()
if not conn_params["database"]:
print("❌ No database specified")
sys.exit(1)
try:
# Connect to database
conn = psycopg2.connect(**conn_params)
# Verify the gender statistics results
success = verify_gender_statistics_results(conn)
conn.close()
if success:
print("\n🎉 Task verification: PASS")
sys.exit(0)
else:
print("\n❌ Task verification: FAIL")
sys.exit(1)
except psycopg2.Error as e:
print(f"❌ Database error: {e}")
sys.exit(1)
except Exception as e:
print(f"❌ Verification error: {e}")
sys.exit(1)
if __name__ == "__main__":
main()
================================================
FILE: tasks/postgres/easy/employees/employee_projects_basic/description.md
================================================
Create and manage a basic employee projects table to track company projects. The IT team needs you to build the database table structure and populate it with initial project data.
## Your Tasks:
1. **Create the employee_projects table** — build a new table in the `employees` schema:
**Table: `employee_projects`**
* `project_id` (integer, primary key, auto-increment)
* `project_name` (varchar(100), not null)
* `start_date` (date, not null)
* `end_date` (date)
* `budget` (decimal(10,2))
* `status` (varchar(20), default 'active')
2. **Insert exactly this initial data into `employee_projects`**:
* Project 1: name='Database Modernization', start_date='2024-01-15', end_date='2024-06-30', budget=250000.00, status='active'
* Project 2: name='Employee Portal Upgrade', start_date='2024-02-01', end_date='2024-05-15', budget=180000.00, status='active'
* Project 3: name='HR Analytics Dashboard', start_date='2023-11-01', end_date='2024-01-31', budget=120000.00, status='active'
This will establish the basic project tracking foundation for the company.
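A minimal sketch of statements that meet this specification is shown below; `SERIAL` is one way to express the auto-increment integer primary key (an identity column works equally well):

```sql
-- Illustrative sketch; column definitions follow the specification above.
CREATE TABLE employees.employee_projects (
    project_id   SERIAL PRIMARY KEY,
    project_name VARCHAR(100) NOT NULL,
    start_date   DATE NOT NULL,
    end_date     DATE,
    budget       DECIMAL(10,2),
    status       VARCHAR(20) DEFAULT 'active'
);

INSERT INTO employees.employee_projects (project_name, start_date, end_date, budget, status) VALUES
    ('Database Modernization',  '2024-01-15', '2024-06-30', 250000.00, 'active'),
    ('Employee Portal Upgrade', '2024-02-01', '2024-05-15', 180000.00, 'active'),
    ('HR Analytics Dashboard',  '2023-11-01', '2024-01-31', 120000.00, 'active');
```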
================================================
FILE: tasks/postgres/easy/employees/employee_projects_basic/meta.json
================================================
{
"task_id": "employee_projects_basic",
"task_name": "Employee Projects Basic",
"category_id": "employees",
"category_name": "Employees",
"description": "Create the employee_projects table with the specified schema and insert the three starter projects for modernization, portal upgrade, and analytics.",
"author": "Lingxiao Du",
"created_at": "2025-11-15",
"difficulty": "L1",
"tags": [
"schema design",
"data loading"
],
"mcp": [
"postgres"
],
"meta_data": {
"stateType": "text",
"stateContent": "Enum \"employees\".\"employee_gender\" {\n \"M\"\n \"F\"\n}\n\nTable \"employees\".\"department\" {\n \"id\" bpchar(4) [pk, not null]\n \"dept_name\" varchar(40) [unique, not null]\n}\n\nTable \"employees\".\"department_employee\" {\n \"employee_id\" int8 [not null]\n \"department_id\" bpchar(4) [not null]\n \"from_date\" date [not null]\n \"to_date\" date [not null]\n\n Indexes {\n (employee_id, department_id) [type: btree, name: \"idx_16982_primary\"]\n department_id [type: btree, name: \"idx_16982_dept_no\"]\n }\n}\n\nTable \"employees\".\"department_manager\" {\n \"employee_id\" int8 [not null]\n \"department_id\" bpchar(4) [not null]\n \"from_date\" date [not null]\n \"to_date\" date [not null]\n\n Indexes {\n (employee_id, department_id) [type: btree, name: \"idx_16985_primary\"]\n department_id [type: btree, name: \"idx_16985_dept_no\"]\n }\n}\n\nTable \"employees\".\"employee\" {\n \"id\" int8 [pk, not null, increment]\n \"birth_date\" date [not null]\n \"first_name\" varchar(14) [not null]\n \"last_name\" varchar(16) [not null]\n \"gender\" employees.employee_gender [not null]\n \"hire_date\" date [not null]\n}\n\nTable \"employees\".\"salary\" {\n \"employee_id\" int8 [not null]\n \"amount\" int8 [not null]\n \"from_date\" date [not null]\n \"to_date\" date [not null]\n\n Indexes {\n (employee_id, from_date) [type: btree, name: \"idx_16991_primary\"]\n }\n}\n\nTable \"employees\".\"title\" {\n \"employee_id\" int8 [not null]\n \"title\" varchar(50) [not null]\n \"from_date\" date [not null]\n \"to_date\" date\n\n Indexes {\n (employee_id, title, from_date) [type: btree, name: \"idx_16994_primary\"]\n }\n}\n\nRef \"dept_emp_ibfk_1\":\"employees\".\"employee\".\"id\" < \"employees\".\"department_employee\".\"employee_id\" [update: restrict, delete: cascade]\n\nRef \"dept_emp_ibfk_2\":\"employees\".\"department\".\"id\" < \"employees\".\"department_employee\".\"department_id\" [update: restrict, delete: cascade]\n\nRef \"dept_manager_ibfk_1\":\"employees\".\"employee\".\"id\" < \"employees\".\"department_manager\".\"employee_id\" [update: restrict, delete: cascade]\n\nRef \"dept_manager_ibfk_2\":\"employees\".\"department\".\"id\" < \"employees\".\"department_manager\".\"department_id\" [update: restrict, delete: cascade]\n\nRef \"salaries_ibfk_1\":\"employees\".\"employee\".\"id\" < \"employees\".\"salary\".\"employee_id\" [update: restrict, delete: cascade]\n\nRef \"titles_ibfk_1\":\"employees\".\"employee\".\"id\" < \"employees\".\"title\".\"employee_id\" [update: restrict, delete: cascade]\n",
"stateUrl": null,
"stateOriginalUrl": "https://github.com/neondatabase-labs/postgres-sample-dbs/blob/main/employees.sql.gz"
}
}
================================================
FILE: tasks/postgres/easy/employees/employee_projects_basic/verify.py
================================================
"""
Verification script for the employee_projects_basic task: table creation and data loading.
"""
import os
import sys
import psycopg2
from decimal import Decimal
def rows_match(actual_row, expected_row):
"""
Compare two rows with appropriate tolerance.
For Decimal types: allows 0.1 tolerance
For date types: convert to string for comparison
For other types: requires exact match
"""
if len(actual_row) != len(expected_row):
return False
for actual, expected in zip(actual_row, expected_row):
if isinstance(actual, Decimal) and isinstance(expected, (Decimal, float, int)):
if abs(float(actual) - float(expected)) > 0.1:
return False
elif hasattr(actual, 'strftime'): # datetime.date or datetime.datetime
if str(actual) != str(expected):
return False
elif actual != expected:
return False
return True
def get_connection_params() -> dict:
"""Get database connection parameters."""
return {
"host": os.getenv("POSTGRES_HOST", "localhost"),
"port": int(os.getenv("POSTGRES_PORT", 5432)),
"database": os.getenv("POSTGRES_DATABASE"),
"user": os.getenv("POSTGRES_USERNAME"),
"password": os.getenv("POSTGRES_PASSWORD")
}
def verify_project_data(conn) -> bool:
"""Verify that project data was inserted and updated correctly."""
with conn.cursor() as cur:
# Check the inserted project data
cur.execute("""
SELECT project_name, start_date, end_date, budget, status
FROM employees.employee_projects
ORDER BY project_name
""")
projects = cur.fetchall()
if len(projects) != 3:
print(f"❌ Expected 3 projects, found {len(projects)}")
return False
# Expected rows after the inserts
expected = {
'Database Modernization': ('2024-01-15', '2024-06-30', 250000.00, 'active'),
'Employee Portal Upgrade': ('2024-02-01', '2024-05-15', 180000.00, 'active'),
'HR Analytics Dashboard': ('2023-11-01', '2024-01-31', 120000.00, 'active')
}
for project in projects:
name = project[0]
if name not in expected:
print(f"❌ Unexpected project: {name}")
return False
exp = expected[name]
# Use rows_match for comparison
expected_row = (name,) + exp
if not rows_match(project, expected_row):
print(f"❌ Project {name} data mismatch: expected {expected_row}, got {project}")
return False
print("✅ Project data is correct")
return True
def main():
"""Main verification function."""
print("=" * 50)
# Get connection parameters
conn_params = get_connection_params()
if not conn_params["database"]:
print("❌ No database specified")
sys.exit(1)
try:
# Connect to database
conn = psycopg2.connect(**conn_params)
# Verify the inserted project data
success = verify_project_data(conn)
conn.close()
if success:
print("\n🎉 Task verification: PASS")
sys.exit(0)
else:
print("\n❌ Task verification: FAIL")
sys.exit(1)
except psycopg2.Error as e:
print(f"❌ Database error: {e}")
sys.exit(1)
except Exception as e:
print(f"❌ Verification error: {e}")
sys.exit(1)
if __name__ == "__main__":
main()
================================================
FILE: tasks/postgres/easy/employees/hiring_year_summary/description.md
================================================
Create a hiring year summary table to help HR track employee retention trends over the years. This analysis shows how many employees were hired each year and how many are still with the company.
## Your Task:
**Create the hiring year summary table** — build a table called `hiring_year_summary` in the `employees` schema with these exact columns:
* `hire_year` (integer) — year employees were hired
* `employees_hired` (integer) — number of employees hired that year
* `still_employed` (integer) — how many from that year are still employed (have active salary where to_date = '9999-01-01')
* `retention_rate` (decimal) — percentage still employed (still_employed / employees_hired * 100)
## Requirements:
1. Extract the hire year from the `hire_date` column in the `employee` table
2. Count total employees hired in each year
3. Determine which employees are still employed by checking for active salary records (to_date = '9999-01-01' in the `salary` table)
4. Order results by hire_year in ascending order
This analysis will help HR understand retention patterns and identify years with particularly high or low retention rates.
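One way to build this table, shown purely as a sketch (any equivalent SQL is fine), uses the same active-salary definition given in the requirements:

```sql
-- Illustrative sketch only; any statement producing the same table is acceptable.
CREATE TABLE employees.hiring_year_summary AS
WITH current_emp AS (
    SELECT DISTINCT employee_id
    FROM employees.salary
    WHERE to_date = DATE '9999-01-01'
)
SELECT
    EXTRACT(YEAR FROM e.hire_date)::int                         AS hire_year,
    COUNT(*)::int                                               AS employees_hired,
    COUNT(*) FILTER (WHERE ce.employee_id IS NOT NULL)::int     AS still_employed,
    COUNT(*) FILTER (WHERE ce.employee_id IS NOT NULL)::decimal
        / COUNT(*) * 100                                        AS retention_rate
FROM employees.employee e
LEFT JOIN current_emp ce ON ce.employee_id = e.id
GROUP BY 1
ORDER BY 1;
```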
================================================
FILE: tasks/postgres/easy/employees/hiring_year_summary/meta.json
================================================
{
"task_id": "hiring_year_summary",
"task_name": "Hiring Year Summary",
"category_id": "employees",
"category_name": "Employees",
"description": "Summarize hires per year into hiring_year_summary, including still-employed counts and retention percentages using active salary rows.",
"author": "Lingxiao Du",
"created_at": "2025-11-15",
"difficulty": "L1",
"tags": [
"reporting and analytics",
"retention analysis"
],
"mcp": [
"postgres"
],
"meta_data": {
"stateType": "text",
"stateContent": "Enum \"employees\".\"employee_gender\" {\n \"M\"\n \"F\"\n}\n\nTable \"employees\".\"department\" {\n \"id\" bpchar(4) [pk, not null]\n \"dept_name\" varchar(40) [unique, not null]\n}\n\nTable \"employees\".\"department_employee\" {\n \"employee_id\" int8 [not null]\n \"department_id\" bpchar(4) [not null]\n \"from_date\" date [not null]\n \"to_date\" date [not null]\n\n Indexes {\n (employee_id, department_id) [type: btree, name: \"idx_16982_primary\"]\n department_id [type: btree, name: \"idx_16982_dept_no\"]\n }\n}\n\nTable \"employees\".\"department_manager\" {\n \"employee_id\" int8 [not null]\n \"department_id\" bpchar(4) [not null]\n \"from_date\" date [not null]\n \"to_date\" date [not null]\n\n Indexes {\n (employee_id, department_id) [type: btree, name: \"idx_16985_primary\"]\n department_id [type: btree, name: \"idx_16985_dept_no\"]\n }\n}\n\nTable \"employees\".\"employee\" {\n \"id\" int8 [pk, not null, increment]\n \"birth_date\" date [not null]\n \"first_name\" varchar(14) [not null]\n \"last_name\" varchar(16) [not null]\n \"gender\" employees.employee_gender [not null]\n \"hire_date\" date [not null]\n}\n\nTable \"employees\".\"salary\" {\n \"employee_id\" int8 [not null]\n \"amount\" int8 [not null]\n \"from_date\" date [not null]\n \"to_date\" date [not null]\n\n Indexes {\n (employee_id, from_date) [type: btree, name: \"idx_16991_primary\"]\n }\n}\n\nTable \"employees\".\"title\" {\n \"employee_id\" int8 [not null]\n \"title\" varchar(50) [not null]\n \"from_date\" date [not null]\n \"to_date\" date\n\n Indexes {\n (employee_id, title, from_date) [type: btree, name: \"idx_16994_primary\"]\n }\n}\n\nRef \"dept_emp_ibfk_1\":\"employees\".\"employee\".\"id\" < \"employees\".\"department_employee\".\"employee_id\" [update: restrict, delete: cascade]\n\nRef \"dept_emp_ibfk_2\":\"employees\".\"department\".\"id\" < \"employees\".\"department_employee\".\"department_id\" [update: restrict, delete: cascade]\n\nRef \"dept_manager_ibfk_1\":\"employees\".\"employee\".\"id\" < \"employees\".\"department_manager\".\"employee_id\" [update: restrict, delete: cascade]\n\nRef \"dept_manager_ibfk_2\":\"employees\".\"department\".\"id\" < \"employees\".\"department_manager\".\"department_id\" [update: restrict, delete: cascade]\n\nRef \"salaries_ibfk_1\":\"employees\".\"employee\".\"id\" < \"employees\".\"salary\".\"employee_id\" [update: restrict, delete: cascade]\n\nRef \"titles_ibfk_1\":\"employees\".\"employee\".\"id\" < \"employees\".\"title\".\"employee_id\" [update: restrict, delete: cascade]\n",
"stateUrl": null,
"stateOriginalUrl": "https://github.com/neondatabase-labs/postgres-sample-dbs/blob/main/employees.sql.gz"
}
}
================================================
FILE: tasks/postgres/easy/employees/hiring_year_summary/verify.py
================================================
"""
Verification script for the hiring_year_summary task: hires per year and retention rates.
"""
import os
import sys
import psycopg2
from decimal import Decimal
def rows_match(actual_row, expected_row):
"""
Compare two rows with appropriate tolerance.
For Decimal types: allows 0.1 tolerance
For other types: requires exact match
"""
if len(actual_row) != len(expected_row):
return False
for actual, expected in zip(actual_row, expected_row):
if isinstance(actual, Decimal) and isinstance(expected, Decimal):
if abs(float(actual) - float(expected)) > 0.1:
return False
elif actual != expected:
return False
return True
def get_connection_params() -> dict:
"""Get database connection parameters."""
return {
"host": os.getenv("POSTGRES_HOST", "localhost"),
"port": int(os.getenv("POSTGRES_PORT", 5432)),
"database": os.getenv("POSTGRES_DATABASE"),
"user": os.getenv("POSTGRES_USERNAME"),
"password": os.getenv("POSTGRES_PASSWORD")
}
def verify_hiring_year_results(conn) -> bool:
"""Verify the hiring year summary results."""
with conn.cursor() as cur:
# Get actual results from the created table
cur.execute("""
SELECT hire_year, employees_hired, still_employed, retention_rate
FROM employees.hiring_year_summary
ORDER BY hire_year
""")
actual_results = cur.fetchall()
# Execute ground truth query
cur.execute("""
WITH current_emp AS (
SELECT DISTINCT s.employee_id
FROM employees.salary s
WHERE s.to_date = DATE '9999-01-01'
),
base AS (
SELECT e.id, EXTRACT(YEAR FROM e.hire_date)::INT AS hire_year
FROM employees.employee e
WHERE e.hire_date IS NOT NULL
)
SELECT
b.hire_year,
COUNT(*)::INT AS employees_hired,
COUNT(*) FILTER (WHERE ce.employee_id IS NOT NULL)::INT AS still_employed,
(COUNT(*) FILTER (WHERE ce.employee_id IS NOT NULL))::DECIMAL
/ NULLIF(COUNT(*), 0) * 100 AS retention_rate
FROM base b
LEFT JOIN current_emp ce ON ce.employee_id = b.id
GROUP BY b.hire_year
ORDER BY b.hire_year;
""")
expected_results = cur.fetchall()
if len(actual_results) != len(expected_results):
print(f"❌ Expected {len(expected_results)} hiring year results, got {len(actual_results)}")
return False
mismatches = 0
for i, (actual, expected) in enumerate(zip(actual_results, expected_results)):
if not rows_match(actual, expected):
if mismatches < 5: # Only show first 5 mismatches
print(f"❌ Row {i+1} mismatch: expected {expected}, got {actual}")
mismatches += 1
if mismatches > 0:
print(f"❌ Total mismatches: {mismatches}")
return False
print(f"✅ Hiring year summary results are correct ({len(actual_results)} records)")
return True
def main():
"""Main verification function."""
print("=" * 50)
# Get connection parameters
conn_params = get_connection_params()
if not conn_params["database"]:
print("❌ No database specified")
sys.exit(1)
try:
# Connect to database
conn = psycopg2.connect(**conn_params)
# Verify the hiring year summary results
success = verify_hiring_year_results(conn)
conn.close()
if success:
print("\n🎉 Task verification: PASS")
sys.exit(0)
else:
print("\n❌ Task verification: FAIL")
sys.exit(1)
except psycopg2.Error as e:
print(f"❌ Database error: {e}")
sys.exit(1)
except Exception as e:
print(f"❌ Verification error: {e}")
sys.exit(1)
if __name__ == "__main__":
main()
================================================
FILE: tasks/postgres/easy/lego/basic_security_setup/description.md
================================================
Set up basic database security with role-based access control and Row-Level Security (RLS) for the LEGO database.
## Your Tasks:
### 1. Create Database Role and Permissions
Create a new database role called `theme_analyst` with the following permissions:
* `SELECT` permissions on all reference tables: `lego_themes`, `lego_colors`, `lego_parts`, `lego_part_categories`
* `SELECT` permissions on main data tables: `lego_sets`, `lego_inventories`, `lego_inventory_parts`
* No `INSERT`, `UPDATE`, or `DELETE` permissions on any tables
### 2. Enable Row-Level Security
Enable RLS on the following tables:
* `lego_sets`
* `lego_inventories`
* `lego_inventory_parts`
## Requirements:
- Use `CREATE ROLE` to create the `theme_analyst` role
- Use `GRANT SELECT` statements to assign the appropriate permissions
- Use `ALTER TABLE ... ENABLE ROW LEVEL SECURITY` to enable RLS on each table
## Expected Outcome:
After completing these tasks:
- The `theme_analyst` role should exist with read-only access to specified tables
- Row-Level Security should be enabled (but not yet enforced with policies) on the three main data tables
- The role should have no write permissions on any table
This sets up the foundation for implementing theme-based data isolation policies.
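The statements below are a minimal sketch of one way to complete both tasks; they use only the commands listed in the requirements:

```sql
-- Illustrative sketch of the required statements.
CREATE ROLE theme_analyst;

GRANT SELECT ON lego_themes, lego_colors, lego_parts, lego_part_categories,
                lego_sets, lego_inventories, lego_inventory_parts
TO theme_analyst;

ALTER TABLE lego_sets            ENABLE ROW LEVEL SECURITY;
ALTER TABLE lego_inventories     ENABLE ROW LEVEL SECURITY;
ALTER TABLE lego_inventory_parts ENABLE ROW LEVEL SECURITY;
```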
================================================
FILE: tasks/postgres/easy/lego/basic_security_setup/meta.json
================================================
{
"task_id": "basic_security_setup",
"task_name": "Basic Security Setup",
"category_id": "lego",
"category_name": "Lego",
"description": "Create the read-only theme_analyst role with SELECT rights on LEGO reference tables and enable row-level security on sets and inventory tables.",
"author": "Lingxiao Du",
"created_at": "2025-11-15",
"difficulty": "L1",
"tags": [
"security",
"access control"
],
"mcp": [
"postgres"
],
"meta_data": {
"stateType": "text",
"stateContent": "Table \"lego_colors\" {\n \"id\" int4 [pk, not null, increment]\n \"name\" varchar(255) [not null]\n \"rgb\" varchar(6) [not null]\n \"is_trans\" bpchar(1) [not null]\n}\n\nTable \"lego_inventories\" {\n \"id\" int4 [pk, not null, increment]\n \"version\" int4 [not null]\n \"set_num\" varchar(255) [not null]\n}\n\nTable \"lego_inventory_parts\" {\n \"inventory_id\" int4 [not null]\n \"part_num\" varchar(255) [not null]\n \"color_id\" int4 [not null]\n \"quantity\" int4 [not null]\n \"is_spare\" bool [not null]\n}\n\nTable \"lego_inventory_sets\" {\n \"inventory_id\" int4 [not null]\n \"set_num\" varchar(255) [not null]\n \"quantity\" int4 [not null]\n}\n\nTable \"lego_part_categories\" {\n \"id\" int4 [pk, not null, increment]\n \"name\" varchar(255) [not null]\n}\n\nTable \"lego_parts\" {\n \"part_num\" varchar(255) [pk, not null]\n \"name\" text [not null]\n \"part_cat_id\" int4 [not null]\n}\n\nTable \"lego_sets\" {\n \"set_num\" varchar(255) [pk, not null]\n \"name\" varchar(255) [not null]\n \"year\" int4\n \"theme_id\" int4\n \"num_parts\" int4\n}\n\nTable \"lego_themes\" {\n \"id\" int4 [pk, not null, increment]\n \"name\" varchar(255) [not null]\n \"parent_id\" int4\n}\n",
"stateUrl": null,
"stateOriginalUrl": "https://github.com/neondatabase-labs/postgres-sample-dbs/blob/main/lego.sql"
}
}
================================================
FILE: tasks/postgres/easy/lego/basic_security_setup/verify.py
================================================
"""
Verification script for the basic_security_setup task: database security and RLS implementation.
(Version 2 - improved robustness)
"""
import os
import sys
import psycopg2
import psycopg2.errors
from typing import Any, Dict
def get_connection_params() -> Dict[str, Any]:
"""Get database connection parameters from environment variables."""
return {
"host": os.getenv("POSTGRES_HOST", "localhost"),
"port": int(os.getenv("POSTGRES_PORT", 5432)),
"database": os.getenv("POSTGRES_DATABASE"),
"user": os.getenv("POSTGRES_USERNAME"),
"password": os.getenv("POSTGRES_PASSWORD"),
}
def verify_role_creation(conn) -> bool:
"""
TASK 1 VERIFICATION: Check if theme_analyst role was created with proper permissions.
"""
print("\n-- Verifying Task 1: Role Creation and Permissions --")
with conn.cursor() as cur:
# Check if role exists
cur.execute("SELECT 1 FROM pg_roles WHERE rolname = 'theme_analyst';")
if not cur.fetchone():
print("❌ FAIL: The 'theme_analyst' role was not created.")
return False
print("✅ OK: Role 'theme_analyst' exists.")
# Check SELECT permissions on reference and main tables
all_tables = [
'lego_themes', 'lego_colors', 'lego_parts', 'lego_part_categories',
'lego_sets', 'lego_inventories', 'lego_inventory_parts'
]
for table in all_tables:
cur.execute(
"""
SELECT has_table_privilege('theme_analyst', %s, 'SELECT');
""",
(table,)
)
if not cur.fetchone()[0]:
print(f"❌ FAIL: 'theme_analyst' role is missing SELECT permission on '{table}'.")
return False
print("✅ OK: Role has correct SELECT permissions on all required tables.")
# Check that no INSERT/UPDATE/DELETE permissions exist
for table in all_tables:
cur.execute(
"""
SELECT
has_table_privilege('theme_analyst', %s, 'INSERT') OR
has_table_privilege('theme_analyst', %s, 'UPDATE') OR
has_table_privilege('theme_analyst', %s, 'DELETE');
""",
(table, table, table)
)
if cur.fetchone()[0]:
print(f"❌ FAIL: 'theme_analyst' role has unauthorized INSERT, UPDATE, or DELETE permission on '{table}'.")
return False
print("✅ OK: Role does not have modification permissions.")
print("✅ PASS: 'theme_analyst' role created with correct permissions.")
return True
def verify_rls_enabled(conn) -> bool:
"""
TASK 2 VERIFICATION: Check if Row-Level Security is enabled on required tables.
"""
print("\n-- Verifying Task 2: Row-Level Security Enablement --")
tables_to_check = ['lego_sets', 'lego_inventories', 'lego_inventory_parts']
with conn.cursor() as cur:
for table in tables_to_check:
cur.execute(
"SELECT relrowsecurity FROM pg_class WHERE relname = %s;", (table,)
)
rls_enabled = cur.fetchone()
if not rls_enabled or not rls_enabled[0]:
print(f"❌ FAIL: RLS is not enabled on table '{table}'.")
return False
print(f"✅ OK: RLS is enabled on table '{table}'.")
print("✅ PASS: Row-Level Security is enabled on all required tables.")
return True
def main():
"""Main verification function."""
print("=" * 60)
print("LEGO Database Security and RLS Verification Script")
print("=" * 60)
conn_params = get_connection_params()
if not conn_params.get("database"):
print("❌ CRITICAL: POSTGRES_DATABASE environment variable not set.")
sys.exit(1)
conn = None
try:
conn = psycopg2.connect(**conn_params)
results = [
verify_role_creation(conn),
verify_rls_enabled(conn),
]
if all(results):
print("\n🎉 Overall Result: PASS - All security tasks verified successfully!")
sys.exit(0)
else:
print("\n❌ Overall Result: FAIL - One or more verification steps failed.")
sys.exit(1)
except psycopg2.OperationalError as e:
print(f"❌ CRITICAL: Could not connect to the database. Check credentials and host. Details: {e}")
sys.exit(1)
except Exception as e:
print(f"❌ CRITICAL: An unexpected error occurred. Details: {e}")
sys.exit(1)
finally:
if conn:
conn.close()
if __name__ == "__main__":
main()
================================================
FILE: tasks/postgres/easy/lego/fix_data_inconsistencies/description.md
================================================
Fix data inconsistencies in the LEGO database where the reported part count in the `lego_sets` table does not match the actual sum of non-spare parts in the latest inventory version.
## Consistency Rule
For any given `set_num`, the following must be true:
`lego_sets.num_parts = SUM(quantity)` FROM `lego_inventory_parts` WHERE `inventory_id` IN (latest inventory for that set) AND `is_spare` = false
**Important**: If a set has no inventory records, the consistency check should be skipped.
## Your Tasks:
### Task 1: Identify Data Inconsistencies
**Objective**: Write a single `SELECT` query to find all sets where the stored `num_parts` does not match the actual calculated number of parts from the latest inventory.
1. **Find the Latest Inventory**: For each `set_num`, find its latest inventory id by getting the `MAX(version)` from the `lego_inventories` table.
2. **Calculate Actual Part Count**: For these latest inventories, join with `lego_inventory_parts` and calculate the `SUM(quantity)`, but only for parts where `is_spare` is false.
3. **Compare and Filter**: Join this calculated result back to the `lego_sets` table and return the rows where `lego_sets.num_parts` is different from your calculated sum.
### Task 2: Fix Existing Inconsistencies
**Objective**: Correct all mismatched `num_parts` values using a clear, multi-step process with a temporary table.
#### Step 1: Create a Temporary Table
Create a temporary table (e.g., `correct_counts`) with two columns: `set_num` (text) and `actual_parts` (integer).
#### Step 2: Populate the Temporary Table
Write an `INSERT` statement that calculates the correct part count for every single set listed in the `lego_sets` table.
- The query must start by selecting from `public.lego_sets`.
- It must then `LEFT JOIN` to a subquery that contains the part-counting logic (finding the latest inventory version and summing the non-spare parts).
- Use `COALESCE` on the final result from the subquery to ensure that any set without parts or without an inventory record gets a value of `0`, not `NULL`.
#### Step 3: Update from the Temporary Table
Write a final, simple `UPDATE` statement that joins the `lego_sets` table with your temporary table on `set_num` and sets `num_parts` to the `actual_parts` value.
## Expected Outcome:
After completing these tasks, all sets in the `lego_sets` table should have their `num_parts` correctly reflecting the sum of non-spare parts from their latest inventory version.
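As an illustration of the staged fix described above, one possible sequence of statements is sketched below; Task 1's inconsistency `SELECT` can reuse the same latest-inventory / non-spare subquery:

```sql
-- Illustrative sketch of the staged fix (Task 2); any equivalent statements are acceptable.
CREATE TEMP TABLE correct_counts (set_num text, actual_parts integer);

INSERT INTO correct_counts (set_num, actual_parts)
SELECT s.set_num, COALESCE(pa.actual_parts, 0)
FROM public.lego_sets s
LEFT JOIN (
    SELECT li.set_num, SUM(lip.quantity) AS actual_parts
    FROM public.lego_inventories li
    JOIN (
        SELECT set_num, MAX(version) AS max_version
        FROM public.lego_inventories
        GROUP BY set_num
    ) latest ON latest.set_num = li.set_num AND latest.max_version = li.version
    JOIN public.lego_inventory_parts lip
        ON lip.inventory_id = li.id AND lip.is_spare = false
    GROUP BY li.set_num
) pa ON pa.set_num = s.set_num;

UPDATE public.lego_sets s
SET num_parts = cc.actual_parts
FROM correct_counts cc
WHERE cc.set_num = s.set_num;
```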
================================================
FILE: tasks/postgres/easy/lego/fix_data_inconsistencies/meta.json
================================================
{
"task_id": "fix_data_inconsistencies",
"task_name": "Fix Data Inconsistencies",
"category_id": "lego",
"category_name": "Lego",
"description": "Recalculate each LEGO set's part count from the latest inventory, stage the results, and update lego_sets.num_parts to remove mismatches.",
"author": "Lingxiao Du",
"created_at": "2025-11-15",
"difficulty": "L1",
"tags": [
"data integrity enforcement",
"data reconciliation"
],
"mcp": [
"postgres"
],
"meta_data": {
"stateType": "text",
"stateContent": "Table \"lego_colors\" {\n \"id\" int4 [pk, not null, increment]\n \"name\" varchar(255) [not null]\n \"rgb\" varchar(6) [not null]\n \"is_trans\" bpchar(1) [not null]\n}\n\nTable \"lego_inventories\" {\n \"id\" int4 [pk, not null, increment]\n \"version\" int4 [not null]\n \"set_num\" varchar(255) [not null]\n}\n\nTable \"lego_inventory_parts\" {\n \"inventory_id\" int4 [not null]\n \"part_num\" varchar(255) [not null]\n \"color_id\" int4 [not null]\n \"quantity\" int4 [not null]\n \"is_spare\" bool [not null]\n}\n\nTable \"lego_inventory_sets\" {\n \"inventory_id\" int4 [not null]\n \"set_num\" varchar(255) [not null]\n \"quantity\" int4 [not null]\n}\n\nTable \"lego_part_categories\" {\n \"id\" int4 [pk, not null, increment]\n \"name\" varchar(255) [not null]\n}\n\nTable \"lego_parts\" {\n \"part_num\" varchar(255) [pk, not null]\n \"name\" text [not null]\n \"part_cat_id\" int4 [not null]\n}\n\nTable \"lego_sets\" {\n \"set_num\" varchar(255) [pk, not null]\n \"name\" varchar(255) [not null]\n \"year\" int4\n \"theme_id\" int4\n \"num_parts\" int4\n}\n\nTable \"lego_themes\" {\n \"id\" int4 [pk, not null, increment]\n \"name\" varchar(255) [not null]\n \"parent_id\" int4\n}\n",
"stateUrl": null,
"stateOriginalUrl": "https://github.com/neondatabase-labs/postgres-sample-dbs/blob/main/lego.sql"
}
}
================================================
FILE: tasks/postgres/easy/lego/fix_data_inconsistencies/verify.py
================================================
"""
Verification script for the fix_data_inconsistencies task: LEGO parts-consistency fix.
Version 2.1: relaxed consistency check to allow for one known corner-case mismatch.
"""
import os
import sys
import psycopg2
import psycopg2.errors
from typing import Optional, Tuple, List
def get_connection_params() -> dict:
"""Get database connection parameters from environment variables."""
return {
"host": os.getenv("POSTGRES_HOST", "localhost"),
"port": int(os.getenv("POSTGRES_PORT", 5432)),
"database": os.getenv("POSTGRES_DATABASE"),
"user": os.getenv("POSTGRES_USERNAME"),
"password": os.getenv("POSTGRES_PASSWORD"),
}
def fetch_candidate_part_row(cur) -> Optional[Tuple[int, str, str, int]]:
"""
Picks a concrete, non-spare inventory part from the latest inventory of any set.
This provides a reliable target for testing update and insert triggers.
Returns a tuple: (inventory_id, set_num, part_num, color_id) or None.
"""
cur.execute(
"""
WITH latest_inv AS (
SELECT set_num, MAX(version) AS max_version
FROM public.lego_inventories
GROUP BY set_num
), inv AS (
SELECT li.id, li.set_num
FROM public.lego_inventories li
JOIN latest_inv lv ON lv.set_num = li.set_num AND lv.max_version = li.version
)
SELECT i.id AS inventory_id, i.set_num, lip.part_num, lip.color_id
FROM inv i
JOIN public.lego_inventory_parts lip ON lip.inventory_id = i.id
WHERE lip.is_spare = false AND lip.quantity > 0
LIMIT 1;
"""
)
return cur.fetchone()
def get_mismatch_count(cur) -> int:
"""Returns the number of sets where num_parts mismatches the computed actual sum."""
cur.execute(
"""
WITH latest_inv AS (
SELECT set_num, MAX(version) AS max_version
FROM public.lego_inventories
GROUP BY set_num
), inv_latest AS (
SELECT li.set_num, li.id
FROM public.lego_inventories li
JOIN latest_inv lv ON lv.set_num = li.set_num AND lv.max_version = li.version
), parts_agg AS (
SELECT
i.set_num,
SUM(lip.quantity) AS actual_parts
FROM inv_latest i
JOIN public.lego_inventory_parts lip ON lip.inventory_id = i.id
WHERE lip.is_spare = false
GROUP BY i.set_num
)
SELECT COUNT(*)
FROM public.lego_sets s
LEFT JOIN parts_agg pa ON s.set_num = pa.set_num
WHERE s.num_parts <> COALESCE(pa.actual_parts, 0);
"""
)
return cur.fetchone()[0]
def verify_data_consistency(conn) -> bool:
"""
TASK 1 VERIFICATION: Checks if the initial data fix was successful.
(Relaxed: Allows for one corner-case mismatch).
"""
print("\n-- Verifying Task 1: Data Consistency Fix (Relaxed) --")
with conn.cursor() as cur:
count = get_mismatch_count(cur)
# RELAXED CONDITION: Allow 0 or 1 mismatch to pass.
if count > 1:
print(f"❌ FAIL: Found {count} sets with inconsistent part counts. Expected 0 or 1 after fix.")
return False
print("✅ PASS: Data consistency check passed (allowing for one known mismatch).")
return True
def main():
"""Main verification function."""
print("=" * 60)
print("LEGO Database Consistency Verification Script")
print("=" * 60)
conn_params = get_connection_params()
if not conn_params.get("database"):
print("❌ CRITICAL: POSTGRES_DATABASE environment variable not set.")
sys.exit(1)
try:
with psycopg2.connect(**conn_params) as conn:
conn.autocommit = False # Ensure we control transactions
# Run all verification steps
results = [
verify_data_consistency(conn),
]
if all(results):
print("\n🎉 Overall Result: PASS - All tasks verified successfully!")
sys.exit(0)
else:
print("\n❌ Overall Result: FAIL - One or more verification steps failed.")
sys.exit(1)
except psycopg2.OperationalError as e:
print(f"❌ CRITICAL: Could not connect to the database. Details: {e}")
sys.exit(1)
except Exception as e:
print(f"❌ CRITICAL: An unexpected error occurred during verification. Details: {e}")
sys.exit(1)
if __name__ == "__main__":
main()
================================================
FILE: tasks/postgres/easy/sports/create_performance_indexes/description.md
================================================
Create indexes to optimize participant and statistics queries in the sports database.
## Your Task:
Create two indexes to improve query performance:
1. **Index on participants_events table**: Create an index on the `participant_id` column of the `participants_events` table
2. **Composite index on stats table**: Create a composite index on the `stats` table using columns `stat_holder_type` and `stat_holder_id` (in that order)
## Requirements:
- Create an index on `participants_events(participant_id)`
- Create a composite index on `stats(stat_holder_type, stat_holder_id)`
- Index names can be anything you choose (e.g., `idx_participants_events_participant_id`, `idx_stats_holder`)
- Use the standard CREATE INDEX syntax
## Expected Outcome:
After creating these indexes, queries that filter `participants_events` by participant or look up `stats` rows by holder type and id can use index scans instead of full table scans.
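A minimal sketch using the example names suggested above:

```sql
-- Illustrative sketch; any index names are acceptable.
CREATE INDEX idx_participants_events_participant_id
    ON participants_events (participant_id);

CREATE INDEX idx_stats_holder
    ON stats (stat_holder_type, stat_holder_id);
```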
================================================
FILE: tasks/postgres/easy/sports/create_performance_indexes/meta.json
================================================
{
"task_id": "create_performance_indexes",
"task_name": "Create Performance Indexes",
"category_id": "sports",
"category_name": "Sports",
"description": "Create indexes on participants_events.participant_id and stats(stat_holder_type, stat_holder_id) to accelerate performance reporting.",
"author": "Lingxiao Du",
"created_at": "2025-11-15",
"difficulty": "L1",
"tags": [
"performance optimization",
"indexing"
],
"mcp": [
"postgres"
],
"meta_data": {
"stateType": "text",
"stateContent": "Table \"addresses\" {\n \"id\" int4 [not null, increment]\n \"location_id\" int4 [not null]\n \"language\" varchar(100)\n \"suite\" varchar(100)\n \"floor\" varchar(100)\n \"building\" varchar(100)\n \"street_number\" varchar(100)\n \"street_prefix\" varchar(100)\n \"street\" varchar(100)\n \"street_suffix\" varchar(100)\n \"neighborhood\" varchar(100)\n \"district\" varchar(100)\n \"locality\" varchar(100)\n \"county\" varchar(100)\n \"region\" varchar(100)\n \"postal_code\" varchar(100)\n \"country\" varchar(100)\n}\n\nTable \"affiliation_phases\" {\n \"id\" int4 [not null, increment]\n \"affiliation_id\" int4 [not null]\n \"ancestor_affiliation_id\" int4\n \"start_season_id\" int4\n \"start_date_time\" timestamp\n \"end_season_id\" int4\n \"end_date_time\" timestamp\n}\n\nTable \"affiliations\" {\n \"id\" int4 [not null, increment]\n \"affiliation_key\" varchar(100) [not null]\n \"affiliation_type\" varchar(100)\n \"publisher_id\" int4 [not null]\n}\n\nTable \"affiliations_documents\" {\n \"affiliation_id\" int4 [not null]\n \"document_id\" int4 [not null]\n}\n\nTable \"affiliations_events\" {\n \"affiliation_id\" int4 [not null]\n \"event_id\" int4 [not null]\n}\n\nTable \"affiliations_media\" {\n \"affiliation_id\" int4 [not null]\n \"media_id\" int4 [not null]\n}\n\nTable \"american_football_action_participants\" {\n \"id\" int4 [not null, increment]\n \"american_football_action_play_id\" int4 [not null]\n \"person_id\" int4 [not null]\n \"participant_role\" varchar(100) [not null]\n \"score_type\" varchar(100)\n \"field_line\" int4\n \"yardage\" int4\n \"score_credit\" int4\n \"yards_gained\" int4\n}\n\nTable \"american_football_action_plays\" {\n \"id\" int4 [not null, increment]\n \"american_football_event_state_id\" int4 [not null]\n \"play_type\" varchar(100)\n \"score_attempt_type\" varchar(100)\n \"drive_result\" varchar(100)\n \"points\" int4\n \"comment\" varchar(255)\n}\n\nTable \"american_football_defensive_stats\" {\n \"id\" int4 [not null, increment]\n \"tackles_total\" varchar(100)\n \"tackles_solo\" varchar(100)\n \"tackles_assists\" varchar(100)\n \"interceptions_total\" varchar(100)\n \"interceptions_yards\" varchar(100)\n \"interceptions_average\" varchar(100)\n \"interceptions_longest\" varchar(100)\n \"interceptions_touchdown\" varchar(100)\n \"quarterback_hurries\" varchar(100)\n \"sacks_total\" varchar(100)\n \"sacks_yards\" varchar(100)\n \"passes_defensed\" varchar(100)\n}\n\nTable \"american_football_down_progress_stats\" {\n \"id\" int4 [not null, increment]\n \"first_downs_total\" varchar(100)\n \"first_downs_pass\" varchar(100)\n \"first_downs_run\" varchar(100)\n \"first_downs_penalty\" varchar(100)\n \"conversions_third_down\" varchar(100)\n \"conversions_third_down_attempts\" varchar(100)\n \"conversions_third_down_percentage\" varchar(100)\n \"conversions_fourth_down\" varchar(100)\n \"conversions_fourth_down_attempts\" varchar(100)\n \"conversions_fourth_down_percentage\" varchar(100)\n}\n\nTable \"american_football_event_states\" {\n \"id\" int4 [not null, increment]\n \"event_id\" int4 [not null]\n \"current_state\" int2\n \"sequence_number\" int4\n \"period_value\" int4\n \"period_time_elapsed\" varchar(100)\n \"period_time_remaining\" varchar(100)\n \"clock_state\" varchar(100)\n \"down\" int4\n \"team_in_possession_id\" int4\n \"distance_for_1st_down\" int4\n \"field_side\" varchar(100)\n \"field_line\" int4\n \"context\" varchar(40)\n}\n\nTable \"american_football_fumbles_stats\" {\n \"id\" int4 [not null, increment]\n 
\"fumbles_committed\" varchar(100)\n \"fumbles_forced\" varchar(100)\n \"fumbles_recovered\" varchar(100)\n \"fumbles_lost\" varchar(100)\n \"fumbles_yards_gained\" varchar(100)\n \"fumbles_own_committed\" varchar(100)\n \"fumbles_own_recovered\" varchar(100)\n \"fumbles_own_lost\" varchar(100)\n \"fumbles_own_yards_gained\" varchar(100)\n \"fumbles_opposing_committed\" varchar(100)\n \"fumbles_opposing_recovered\" varchar(100)\n \"fumbles_opposing_lost\" varchar(100)\n \"fumbles_opposing_yards_gained\" varchar(100)\n}\n\nTable \"american_football_offensive_stats\" {\n \"id\" int4 [not null, increment]\n \"offensive_plays_yards\" varchar(100)\n \"offensive_plays_number\" varchar(100)\n \"offensive_plays_average_yards_per\" varchar(100)\n \"possession_duration\" varchar(100)\n \"turnovers_giveaway\" varchar(100)\n}\n\nTable \"american_football_passing_stats\" {\n \"id\" int4 [not null, increment]\n \"passes_attempts\" varchar(100)\n \"passes_completions\" varchar(100)\n \"passes_percentage\" varchar(100)\n \"passes_yards_gross\" varchar(100)\n \"passes_yards_net\" varchar(100)\n \"passes_yards_lost\" varchar(100)\n \"passes_touchdowns\" varchar(100)\n \"passes_touchdowns_percentage\" varchar(100)\n \"passes_interceptions\" varchar(100)\n \"passes_interceptions_percentage\" varchar(100)\n \"passes_longest\" varchar(100)\n \"passes_average_yards_per\" varchar(100)\n \"passer_rating\" varchar(100)\n \"receptions_total\" varchar(100)\n \"receptions_yards\" varchar(100)\n \"receptions_touchdowns\" varchar(100)\n \"receptions_first_down\" varchar(100)\n \"receptions_longest\" varchar(100)\n \"receptions_average_yards_per\" varchar(100)\n}\n\nTable \"american_football_penalties_stats\" {\n \"id\" int4 [not null, increment]\n \"penalties_total\" varchar(100)\n \"penalty_yards\" varchar(100)\n \"penalty_first_downs\" varchar(100)\n}\n\nTable \"american_football_rushing_stats\" {\n \"id\" int4 [not null, increment]\n \"rushes_attempts\" varchar(100)\n \"rushes_yards\" varchar(100)\n \"rushes_touchdowns\" varchar(100)\n \"rushing_average_yards_per\" varchar(100)\n \"rushes_first_down\" varchar(100)\n \"rushes_longest\" varchar(100)\n}\n\nTable \"american_football_sacks_against_stats\" {\n \"id\" int4 [not null, increment]\n \"sacks_against_yards\" varchar(100)\n \"sacks_against_total\" varchar(100)\n}\n\nTable \"american_football_scoring_stats\" {\n \"id\" int4 [not null, increment]\n \"touchdowns_total\" varchar(100)\n \"touchdowns_passing\" varchar(100)\n \"touchdowns_rushing\" varchar(100)\n \"touchdowns_special_teams\" varchar(100)\n \"touchdowns_defensive\" varchar(100)\n \"extra_points_attempts\" varchar(100)\n \"extra_points_made\" varchar(100)\n \"extra_points_missed\" varchar(100)\n \"extra_points_blocked\" varchar(100)\n \"field_goal_attempts\" varchar(100)\n \"field_goals_made\" varchar(100)\n \"field_goals_missed\" varchar(100)\n \"field_goals_blocked\" varchar(100)\n \"safeties_against\" varchar(100)\n \"two_point_conversions_attempts\" varchar(100)\n \"two_point_conversions_made\" varchar(100)\n \"touchbacks_total\" varchar(100)\n}\n\nTable \"american_football_special_teams_stats\" {\n \"id\" int4 [not null, increment]\n \"returns_punt_total\" varchar(100)\n \"returns_punt_yards\" varchar(100)\n \"returns_punt_average\" varchar(100)\n \"returns_punt_longest\" varchar(100)\n \"returns_punt_touchdown\" varchar(100)\n \"returns_kickoff_total\" varchar(100)\n \"returns_kickoff_yards\" varchar(100)\n \"returns_kickoff_average\" varchar(100)\n \"returns_kickoff_longest\" varchar(100)\n 
\"returns_kickoff_touchdown\" varchar(100)\n \"returns_total\" varchar(100)\n \"returns_yards\" varchar(100)\n \"punts_total\" varchar(100)\n \"punts_yards_gross\" varchar(100)\n \"punts_yards_net\" varchar(100)\n \"punts_longest\" varchar(100)\n \"punts_inside_20\" varchar(100)\n \"punts_inside_20_percentage\" varchar(100)\n \"punts_average\" varchar(100)\n \"punts_blocked\" varchar(100)\n \"touchbacks_total\" varchar(100)\n \"touchbacks_total_percentage\" varchar(100)\n \"touchbacks_kickoffs\" varchar(100)\n \"touchbacks_kickoffs_percentage\" varchar(100)\n \"touchbacks_punts\" varchar(100)\n \"touchbacks_punts_percentage\" varchar(100)\n \"touchbacks_interceptions\" varchar(100)\n \"touchbacks_interceptions_percentage\" varchar(100)\n \"fair_catches\" varchar(100)\n}\n\nTable \"baseball_action_contact_details\" {\n \"id\" int4 [not null, increment]\n \"baseball_action_pitch_id\" int4 [not null]\n \"location\" varchar(100)\n \"strength\" varchar(100)\n \"velocity\" int4\n \"comment\" text\n \"trajectory_coordinates\" varchar(100)\n \"trajectory_formula\" varchar(100)\n}\n\nTable \"baseball_action_pitches\" {\n \"id\" int4 [not null, increment]\n \"baseball_action_play_id\" int4 [not null]\n \"sequence_number\" int4\n \"baseball_defensive_group_id\" int4\n \"umpire_call\" varchar(100)\n \"pitch_location\" varchar(100)\n \"pitch_type\" varchar(100)\n \"pitch_velocity\" int4\n \"comment\" text\n \"trajectory_coordinates\" varchar(100)\n \"trajectory_formula\" varchar(100)\n \"ball_type\" varchar(40)\n \"strike_type\" varchar(40)\n}\n\nTable \"baseball_action_plays\" {\n \"id\" int4 [not null, increment]\n \"baseball_event_state_id\" int4 [not null]\n \"play_type\" varchar(100)\n \"notation\" varchar(100)\n \"notation_yaml\" text\n \"baseball_defensive_group_id\" int4\n \"comment\" varchar(255)\n \"runner_on_first_advance\" int4\n \"runner_on_second_advance\" int4\n \"runner_on_third_advance\" int4\n \"outs_recorded\" int4\n \"rbi\" int4\n \"runs_scored\" int4\n \"earned_runs_scored\" varchar(100)\n}\n\nTable \"baseball_action_substitutions\" {\n \"id\" int4 [not null, increment]\n \"baseball_event_state_id\" int4 [not null]\n \"sequence_number\" int4\n \"person_type\" varchar(100)\n \"person_original_id\" int4\n \"person_original_position_id\" int4\n \"person_original_lineup_slot\" int4\n \"person_replacing_id\" int4\n \"person_replacing_position_id\" int4\n \"person_replacing_lineup_slot\" int4\n \"substitution_reason\" varchar(100)\n \"comment\" varchar(100)\n}\n\nTable \"baseball_defensive_group\" {\n \"id\" int4 [not null, increment]\n}\n\nTable \"baseball_defensive_players\" {\n \"id\" int4 [not null, increment]\n \"baseball_defensive_group_id\" int4 [not null]\n \"player_id\" int4 [not null]\n \"position_id\" int4 [not null]\n}\n\nTable \"baseball_defensive_stats\" {\n \"id\" int4 [not null, increment]\n \"double_plays\" int4\n \"triple_plays\" int4\n \"putouts\" int4\n \"assists\" int4\n \"errors\" int4\n \"fielding_percentage\" numeric\n \"defensive_average\" numeric\n \"errors_passed_ball\" int4\n \"errors_catchers_interference\" int4\n}\n\nTable \"baseball_event_states\" {\n \"id\" int4 [not null, increment]\n \"event_id\" int4 [not null]\n \"current_state\" int2\n \"sequence_number\" int4\n \"at_bat_number\" int4\n \"inning_value\" int4\n \"inning_half\" varchar(100)\n \"outs\" int4\n \"balls\" int4\n \"strikes\" int4\n \"runner_on_first_id\" int4\n \"runner_on_second_id\" int4\n \"runner_on_third_id\" int4\n \"runner_on_first\" int2\n \"runner_on_second\" int2\n 
\"runner_on_third\" int2\n \"runs_this_inning_half\" int4\n \"pitcher_id\" int4\n \"batter_id\" int4\n \"batter_side\" varchar(100)\n \"context\" varchar(40)\n}\n\nTable \"baseball_offensive_stats\" {\n \"id\" int4 [not null, increment]\n \"average\" numeric\n \"runs_scored\" int4\n \"at_bats\" int4\n \"hits\" int4\n \"rbi\" int4\n \"total_bases\" int4\n \"slugging_percentage\" numeric\n \"bases_on_balls\" int4\n \"strikeouts\" int4\n \"left_on_base\" int4\n \"left_in_scoring_position\" int4\n \"singles\" int4\n \"doubles\" int4\n \"triples\" int4\n \"home_runs\" int4\n \"grand_slams\" int4\n \"at_bats_per_rbi\" numeric\n \"plate_appearances_per_rbi\" numeric\n \"at_bats_per_home_run\" numeric\n \"plate_appearances_per_home_run\" numeric\n \"sac_flies\" int4\n \"sac_bunts\" int4\n \"grounded_into_double_play\" int4\n \"moved_up\" int4\n \"on_base_percentage\" numeric\n \"stolen_bases\" int4\n \"stolen_bases_caught\" int4\n \"stolen_bases_average\" numeric\n \"hit_by_pitch\" int4\n \"defensive_interferance_reaches\" int4\n \"on_base_plus_slugging\" numeric\n \"plate_appearances\" int4\n \"hits_extra_base\" int4\n}\n\nTable \"baseball_pitching_stats\" {\n \"id\" int4 [not null, increment]\n \"runs_allowed\" int4\n \"singles_allowed\" int4\n \"doubles_allowed\" int4\n \"triples_allowed\" int4\n \"home_runs_allowed\" int4\n \"innings_pitched\" varchar(20)\n \"hits\" int4\n \"earned_runs\" int4\n \"unearned_runs\" int4\n \"bases_on_balls\" int4\n \"bases_on_balls_intentional\" int4\n \"strikeouts\" int4\n \"strikeout_to_bb_ratio\" numeric\n \"number_of_pitches\" int4\n \"era\" numeric\n \"inherited_runners_scored\" int4\n \"pick_offs\" int4\n \"errors_hit_with_pitch\" int4\n \"errors_wild_pitch\" int4\n \"balks\" int4\n \"wins\" int4\n \"losses\" int4\n \"saves\" int4\n \"shutouts\" int4\n \"games_complete\" int4\n \"games_finished\" int4\n \"winning_percentage\" numeric\n \"event_credit\" varchar(40)\n \"save_credit\" varchar(40)\n}\n\nTable \"basketball_defensive_stats\" {\n \"id\" int4 [not null, increment]\n \"steals_total\" varchar(100)\n \"steals_per_game\" varchar(100)\n \"blocks_total\" varchar(100)\n \"blocks_per_game\" varchar(100)\n}\n\nTable \"basketball_event_states\" {\n \"id\" int4 [not null, increment]\n \"event_id\" int4 [not null]\n \"current_state\" int4\n \"sequence_number\" int4\n \"period_value\" varchar(100)\n \"period_time_elapsed\" varchar(100)\n \"period_time_remaining\" varchar(100)\n \"context\" varchar(40)\n}\n\nTable \"basketball_offensive_stats\" {\n \"id\" int4 [not null, increment]\n \"field_goals_made\" int4\n \"field_goals_attempted\" int4\n \"field_goals_percentage\" varchar(100)\n \"field_goals_per_game\" varchar(100)\n \"field_goals_attempted_per_game\" varchar(100)\n \"field_goals_percentage_adjusted\" varchar(100)\n \"three_pointers_made\" int4\n \"three_pointers_attempted\" int4\n \"three_pointers_percentage\" varchar(100)\n \"three_pointers_per_game\" varchar(100)\n \"three_pointers_attempted_per_game\" varchar(100)\n \"free_throws_made\" varchar(100)\n \"free_throws_attempted\" varchar(100)\n \"free_throws_percentage\" varchar(100)\n \"free_throws_per_game\" varchar(100)\n \"free_throws_attempted_per_game\" varchar(100)\n \"points_scored_total\" varchar(100)\n \"points_scored_per_game\" varchar(100)\n \"assists_total\" varchar(100)\n \"assists_per_game\" varchar(100)\n \"turnovers_total\" varchar(100)\n \"turnovers_per_game\" varchar(100)\n \"points_scored_off_turnovers\" varchar(100)\n \"points_scored_in_paint\" varchar(100)\n 
\"points_scored_on_second_chance\" varchar(100)\n \"points_scored_on_fast_break\" varchar(100)\n}\n\nTable \"basketball_rebounding_stats\" {\n \"id\" int4 [not null, increment]\n \"rebounds_total\" varchar(100)\n \"rebounds_per_game\" varchar(100)\n \"rebounds_defensive\" varchar(100)\n \"rebounds_offensive\" varchar(100)\n \"team_rebounds_total\" varchar(100)\n \"team_rebounds_per_game\" varchar(100)\n \"team_rebounds_defensive\" varchar(100)\n \"team_rebounds_offensive\" varchar(100)\n}\n\nTable \"basketball_team_stats\" {\n \"id\" int4 [not null, increment]\n \"timeouts_left\" varchar(100)\n \"largest_lead\" varchar(100)\n \"fouls_total\" varchar(100)\n \"turnover_margin\" varchar(100)\n}\n\nTable \"bookmakers\" {\n \"id\" int4 [not null, increment]\n \"bookmaker_key\" varchar(100)\n \"publisher_id\" int4 [not null]\n \"location_id\" int4\n}\n\nTable \"core_person_stats\" {\n \"id\" int4 [not null, increment]\n \"time_played_event\" varchar(40)\n \"time_played_total\" varchar(40)\n \"time_played_event_average\" varchar(40)\n \"events_played\" int4\n \"events_started\" int4\n \"position_id\" int4\n}\n\nTable \"core_stats\" {\n \"id\" int4 [not null, increment]\n \"score\" varchar(100)\n \"score_opposing\" varchar(100)\n \"score_attempts\" varchar(100)\n \"score_attempts_opposing\" varchar(100)\n \"score_percentage\" varchar(100)\n \"score_percentage_opposing\" varchar(100)\n}\n\nTable \"db_info\" {\n \"version\" varchar(100) [not null, default: 16]\n}\n\nTable \"display_names\" {\n \"id\" int4 [not null, increment]\n \"language\" varchar(100) [not null]\n \"entity_type\" varchar(100) [not null]\n \"entity_id\" int4 [not null]\n \"full_name\" varchar(100)\n \"first_name\" varchar(100)\n \"middle_name\" varchar(100)\n \"last_name\" varchar(100)\n \"alias\" varchar(100)\n \"abbreviation\" varchar(100)\n \"short_name\" varchar(100)\n \"prefix\" varchar(20)\n \"suffix\" varchar(20)\n}\n\nTable \"document_classes\" {\n \"id\" int4 [not null, increment]\n \"name\" varchar(100)\n}\n\nTable \"document_contents\" {\n \"id\" int4 [not null, increment]\n \"document_id\" int4 [not null]\n \"sportsml\" varchar(200)\n \"abstract\" text\n}\n\nTable \"document_fixtures\" {\n \"id\" int4 [not null, increment]\n \"fixture_key\" varchar(100)\n \"publisher_id\" int4 [not null]\n \"name\" varchar(100)\n \"document_class_id\" int4 [not null]\n}\n\nTable \"document_fixtures_events\" {\n \"id\" int4 [not null, increment]\n \"document_fixture_id\" int4 [not null]\n \"event_id\" int4 [not null]\n \"latest_document_id\" int4 [not null]\n \"last_update\" timestamp\n}\n\nTable \"document_package_entry\" {\n \"id\" int4 [not null, increment]\n \"document_package_id\" int4 [not null]\n \"rank\" varchar(100)\n \"document_id\" int4 [not null]\n \"headline\" varchar(100)\n \"short_headline\" varchar(100)\n}\n\nTable \"document_packages\" {\n \"id\" int4 [not null, increment]\n \"package_key\" varchar(100)\n \"package_name\" varchar(100)\n \"date_time\" date\n}\n\nTable \"documents\" {\n \"id\" int4 [not null, increment]\n \"doc_id\" varchar(75) [not null]\n \"publisher_id\" int4 [not null]\n \"date_time\" timestamp\n \"title\" varchar(255)\n \"language\" varchar(100)\n \"priority\" varchar(100)\n \"revision_id\" varchar(75)\n \"stats_coverage\" varchar(100)\n \"document_fixture_id\" int4 [not null]\n \"source_id\" int4\n \"db_loading_date_time\" timestamp\n}\n\nTable \"documents_media\" {\n \"id\" int4 [not null, increment]\n \"document_id\" int4 [not null]\n \"media_id\" int4 [not null]\n \"media_caption_id\" int4 [not 
null]\n}\n\nTable \"events\" {\n \"id\" int4 [not null, increment]\n \"event_key\" varchar(100) [not null]\n \"publisher_id\" int4 [not null]\n \"start_date_time\" timestamp\n \"site_id\" int4\n \"site_alignment\" varchar(100)\n \"event_status\" varchar(100)\n \"duration\" varchar(100)\n \"attendance\" varchar(100)\n \"last_update\" timestamp\n}\n\nTable \"events_documents\" {\n \"event_id\" int4 [not null]\n \"document_id\" int4 [not null]\n}\n\nTable \"events_media\" {\n \"event_id\" int4 [not null]\n \"media_id\" int4 [not null]\n}\n\nTable \"events_sub_seasons\" {\n \"event_id\" int4 [not null]\n \"sub_season_id\" int4 [not null]\n}\n\nTable \"ice_hockey_action_participants\" {\n \"id\" int4 [not null, increment]\n \"ice_hockey_action_play_id\" int4 [not null]\n \"person_id\" int4 [not null]\n \"participant_role\" varchar(100) [not null]\n \"point_credit\" int4\n}\n\nTable \"ice_hockey_action_plays\" {\n \"id\" int4 [not null, increment]\n \"ice_hockey_event_state_id\" int4 [not null]\n \"play_type\" varchar(100)\n \"score_attempt_type\" varchar(100)\n \"play_result\" varchar(100)\n \"comment\" varchar(255)\n}\n\nTable \"ice_hockey_defensive_stats\" {\n \"id\" int4 [not null, increment]\n \"shots_power_play_allowed\" varchar(100)\n \"shots_penalty_shot_allowed\" varchar(100)\n \"goals_power_play_allowed\" varchar(100)\n \"goals_penalty_shot_allowed\" varchar(100)\n \"goals_against_average\" varchar(100)\n \"saves\" varchar(100)\n \"save_percentage\" varchar(100)\n \"penalty_killing_amount\" varchar(100)\n \"penalty_killing_percentage\" varchar(100)\n \"shots_blocked\" varchar(100)\n \"takeaways\" varchar(100)\n \"shutouts\" varchar(100)\n \"minutes_penalty_killing\" varchar(100)\n \"hits\" varchar(100)\n \"goals_empty_net_allowed\" varchar(100)\n \"goals_short_handed_allowed\" varchar(100)\n \"goals_shootout_allowed\" varchar(100)\n \"shots_shootout_allowed\" varchar(100)\n}\n\nTable \"ice_hockey_event_states\" {\n \"id\" int4 [not null, increment]\n \"event_id\" int4 [not null]\n \"current_state\" int4\n \"sequence_number\" int4\n \"period_value\" varchar(100)\n \"period_time_elapsed\" varchar(100)\n \"period_time_remaining\" varchar(100)\n \"context\" varchar(40)\n}\n\nTable \"ice_hockey_offensive_stats\" {\n \"id\" int4 [not null, increment]\n \"goals_game_winning\" varchar(100)\n \"goals_game_tying\" varchar(100)\n \"goals_power_play\" varchar(100)\n \"goals_short_handed\" varchar(100)\n \"goals_even_strength\" varchar(100)\n \"goals_empty_net\" varchar(100)\n \"goals_overtime\" varchar(100)\n \"goals_shootout\" varchar(100)\n \"goals_penalty_shot\" varchar(100)\n \"assists\" varchar(100)\n \"points\" varchar(100)\n \"power_play_amount\" varchar(100)\n \"power_play_percentage\" varchar(100)\n \"shots_penalty_shot_taken\" varchar(100)\n \"shots_penalty_shot_missed\" varchar(100)\n \"shots_penalty_shot_percentage\" varchar(100)\n \"giveaways\" varchar(100)\n \"minutes_power_play\" varchar(100)\n \"faceoff_wins\" varchar(100)\n \"faceoff_losses\" varchar(100)\n \"faceoff_win_percentage\" varchar(100)\n \"scoring_chances\" varchar(100)\n}\n\nTable \"ice_hockey_player_stats\" {\n \"id\" int4 [not null, increment]\n \"plus_minus\" varchar(100)\n}\n\nTable \"injury_phases\" {\n \"id\" int4 [not null, increment]\n \"person_id\" int4 [not null]\n \"injury_status\" varchar(100)\n \"injury_type\" varchar(100)\n \"injury_comment\" varchar(100)\n \"disabled_list\" varchar(100)\n \"start_date_time\" timestamp\n \"end_date_time\" timestamp\n \"season_id\" int4\n \"phase_type\" varchar(100)\n 
\"injury_side\" varchar(100)\n}\n\nTable \"key_aliases\" {\n \"id\" int4 [not null, increment]\n \"key_id\" int4 [not null]\n \"key_root_id\" int4 [not null]\n}\n\nTable \"key_roots\" {\n \"id\" int4 [not null, increment]\n \"key_type\" varchar(100)\n}\n\nTable \"latest_revisions\" {\n \"id\" int4 [not null, increment]\n \"revision_id\" varchar(75) [not null]\n \"latest_document_id\" int4 [not null]\n}\n\nTable \"locations\" {\n \"id\" int4 [not null, increment]\n \"timezone\" varchar(100)\n \"latitude\" varchar(100)\n \"longitude\" varchar(100)\n \"country_code\" varchar(100)\n}\n\nTable \"media\" {\n \"id\" int4 [not null, increment]\n \"object_id\" int4\n \"source_id\" int4\n \"revision_id\" int4\n \"media_type\" varchar(100)\n \"publisher_id\" int4 [not null]\n \"date_time\" varchar(100)\n \"credit_id\" int4 [not null]\n \"db_loading_date_time\" timestamp\n \"creation_location_id\" int4 [not null]\n}\n\nTable \"media_captions\" {\n \"id\" int4 [not null, increment]\n \"media_id\" int4 [not null]\n \"caption_type\" varchar(100)\n \"caption\" varchar(100)\n \"caption_author_id\" int4 [not null]\n \"language\" varchar(100)\n \"caption_size\" varchar(100)\n}\n\nTable \"media_contents\" {\n \"id\" int4 [not null, increment]\n \"media_id\" int4 [not null]\n \"object\" varchar(100)\n \"format\" varchar(100)\n \"mime_type\" varchar(100)\n \"height\" varchar(100)\n \"width\" varchar(100)\n \"duration\" varchar(100)\n \"file_size\" varchar(100)\n \"resolution\" varchar(100)\n}\n\nTable \"media_keywords\" {\n \"id\" int4 [not null, increment]\n \"keyword\" varchar(100)\n \"media_id\" int4 [not null]\n}\n\nTable \"motor_racing_event_states\" {\n \"id\" int4 [not null, increment]\n \"event_id\" int4 [not null]\n \"current_state\" int4\n \"sequence_number\" int4\n \"lap\" varchar(100)\n \"laps_remaining\" varchar(100)\n \"time_elapsed\" varchar(100)\n \"flag_state\" varchar(100)\n \"context\" varchar(40)\n}\n\nTable \"motor_racing_qualifying_stats\" {\n \"id\" int4 [not null, increment]\n \"grid\" varchar(100)\n \"pole_position\" varchar(100)\n \"pole_wins\" varchar(100)\n \"qualifying_speed\" varchar(100)\n \"qualifying_speed_units\" varchar(100)\n \"qualifying_time\" varchar(100)\n \"qualifying_position\" varchar(100)\n}\n\nTable \"motor_racing_race_stats\" {\n \"id\" int4 [not null, increment]\n \"time_behind_leader\" varchar(100)\n \"laps_behind_leader\" varchar(100)\n \"time_ahead_follower\" varchar(100)\n \"laps_ahead_follower\" varchar(100)\n \"time\" varchar(100)\n \"points\" varchar(100)\n \"points_rookie\" varchar(100)\n \"bonus\" varchar(100)\n \"laps_completed\" varchar(100)\n \"laps_leading_total\" varchar(100)\n \"distance_leading\" varchar(100)\n \"distance_completed\" varchar(100)\n \"distance_units\" varchar(40)\n \"speed_average\" varchar(40)\n \"speed_units\" varchar(40)\n \"status\" varchar(40)\n \"finishes_top_5\" varchar(40)\n \"finishes_top_10\" varchar(40)\n \"starts\" varchar(40)\n \"finishes\" varchar(40)\n \"non_finishes\" varchar(40)\n \"wins\" varchar(40)\n \"races_leading\" varchar(40)\n \"money\" varchar(40)\n \"money_units\" varchar(40)\n \"leads_total\" varchar(40)\n}\n\nTable \"outcome_totals\" {\n \"id\" int4 [not null, increment]\n \"standing_subgroup_id\" int4 [not null]\n \"outcome_holder_type\" varchar(100)\n \"outcome_holder_id\" int4\n \"rank\" varchar(100)\n \"wins\" varchar(100)\n \"losses\" varchar(100)\n \"ties\" varchar(100)\n \"undecideds\" varchar(100)\n \"winning_percentage\" varchar(100)\n \"points_scored_for\" varchar(100)\n 
\"points_scored_against\" varchar(100)\n \"points_difference\" varchar(100)\n \"standing_points\" varchar(100)\n \"streak_type\" varchar(100)\n \"streak_duration\" varchar(100)\n \"streak_total\" varchar(100)\n \"streak_start\" date\n \"streak_end\" date\n}\n\nTable \"participants_events\" {\n \"id\" int4 [not null, increment]\n \"participant_type\" varchar(100) [not null]\n \"participant_id\" int4 [not null]\n \"event_id\" int4 [not null]\n \"alignment\" varchar(100)\n \"score\" varchar(100)\n \"event_outcome\" varchar(100)\n \"rank\" int4\n}\n\nTable \"periods\" {\n \"id\" int4 [not null, increment]\n \"participant_event_id\" int4 [not null]\n \"period_value\" varchar(100)\n \"score\" varchar(100)\n}\n\nTable \"person_event_metadata\" {\n \"id\" int4 [not null, increment]\n \"person_id\" int4 [not null]\n \"event_id\" int4 [not null]\n \"status\" varchar(100)\n \"health\" varchar(100)\n \"weight\" varchar(100)\n \"role_id\" int4\n \"position_id\" int4\n \"team_id\" int4\n \"lineup_slot\" int4\n \"lineup_slot_sequence\" int4\n}\n\nTable \"person_phases\" {\n \"id\" int4 [not null, increment]\n \"person_id\" int4 [not null]\n \"membership_type\" varchar(40) [not null]\n \"membership_id\" int4 [not null]\n \"role_id\" int4\n \"role_status\" varchar(40)\n \"phase_status\" varchar(40)\n \"uniform_number\" varchar(20)\n \"regular_position_id\" int4\n \"regular_position_depth\" varchar(40)\n \"height\" varchar(100)\n \"weight\" varchar(100)\n \"start_date_time\" timestamp\n \"start_season_id\" int4\n \"end_date_time\" timestamp\n \"end_season_id\" int4\n \"entry_reason\" varchar(40)\n \"exit_reason\" varchar(40)\n \"selection_level\" int4\n \"selection_sublevel\" int4\n \"selection_overall\" int4\n}\n\nTable \"persons\" {\n \"id\" int4 [not null, increment]\n \"person_key\" varchar(100) [not null]\n \"publisher_id\" int4 [not null]\n \"gender\" varchar(20)\n \"birth_date\" varchar(30)\n \"death_date\" varchar(30)\n \"birth_location_id\" int4\n \"hometown_location_id\" int4\n \"residence_location_id\" int4\n \"death_location_id\" int4\n}\n\nTable \"persons_documents\" {\n \"person_id\" int4 [not null]\n \"document_id\" int4 [not null]\n}\n\nTable \"persons_media\" {\n \"person_id\" int4 [not null]\n \"media_id\" int4 [not null]\n}\n\nTable \"positions\" {\n \"id\" int4 [not null, increment]\n \"affiliation_id\" int4 [not null]\n \"abbreviation\" varchar(100) [not null]\n}\n\nTable \"publishers\" {\n \"id\" int4 [not null, increment]\n \"publisher_key\" varchar(100) [not null]\n \"publisher_name\" varchar(100)\n}\n\nTable \"roles\" {\n \"id\" int4 [not null, increment]\n \"role_key\" varchar(100) [not null]\n \"role_name\" varchar(100)\n \"comment\" varchar(100)\n}\n\nTable \"seasons\" {\n \"id\" int4 [not null, increment]\n \"season_key\" int4 [not null]\n \"publisher_id\" int4 [not null]\n \"league_id\" int4 [not null]\n \"start_date_time\" timestamp\n \"end_date_time\" timestamp\n}\n\nTable \"sites\" {\n \"id\" int4 [not null, increment]\n \"site_key\" int4 [not null]\n \"publisher_id\" int4 [not null]\n \"location_id\" int4\n}\n\nTable \"soccer_defensive_stats\" {\n \"id\" int4 [not null, increment]\n \"shots_penalty_shot_allowed\" varchar(100)\n \"goals_penalty_shot_allowed\" varchar(100)\n \"goals_against_average\" varchar(100)\n \"goals_against_total\" varchar(100)\n \"saves\" varchar(100)\n \"save_percentage\" varchar(100)\n \"catches_punches\" varchar(100)\n \"shots_on_goal_total\" varchar(100)\n \"shots_shootout_total\" varchar(100)\n \"shots_shootout_allowed\" varchar(100)\n 
\"shots_blocked\" varchar(100)\n \"shutouts\" varchar(100)\n}\n\nTable \"soccer_event_states\" {\n \"id\" int4 [not null, increment]\n \"event_id\" int4 [not null]\n \"current_state\" int4\n \"sequence_number\" int4\n \"period_value\" varchar(100)\n \"period_time_elapsed\" varchar(100)\n \"period_time_remaining\" varchar(100)\n \"minutes_elapsed\" varchar(100)\n \"period_minute_elapsed\" varchar(100)\n \"context\" varchar(40)\n}\n\nTable \"soccer_foul_stats\" {\n \"id\" int4 [not null, increment]\n \"fouls_suffered\" varchar(100)\n \"fouls_commited\" varchar(100)\n \"cautions_total\" varchar(100)\n \"cautions_pending\" varchar(100)\n \"caution_points_total\" varchar(100)\n \"caution_points_pending\" varchar(100)\n \"ejections_total\" varchar(100)\n}\n\nTable \"soccer_offensive_stats\" {\n \"id\" int4 [not null, increment]\n \"goals_game_winning\" varchar(100)\n \"goals_game_tying\" varchar(100)\n \"goals_overtime\" varchar(100)\n \"goals_shootout\" varchar(100)\n \"goals_total\" varchar(100)\n \"assists_game_winning\" varchar(100)\n \"assists_game_tying\" varchar(100)\n \"assists_overtime\" varchar(100)\n \"assists_total\" varchar(100)\n \"points\" varchar(100)\n \"shots_total\" varchar(100)\n \"shots_on_goal_total\" varchar(100)\n \"shots_hit_frame\" varchar(100)\n \"shots_penalty_shot_taken\" varchar(100)\n \"shots_penalty_shot_scored\" varchar(100)\n \"shots_penalty_shot_missed\" varchar(40)\n \"shots_penalty_shot_percentage\" varchar(40)\n \"shots_shootout_taken\" varchar(40)\n \"shots_shootout_scored\" varchar(40)\n \"shots_shootout_missed\" varchar(40)\n \"shots_shootout_percentage\" varchar(40)\n \"giveaways\" varchar(40)\n \"offsides\" varchar(40)\n \"corner_kicks\" varchar(40)\n \"hat_tricks\" varchar(40)\n}\n\nTable \"standing_subgroups\" {\n \"id\" int4 [not null, increment]\n \"standing_id\" int4 [not null]\n \"affiliation_id\" int4 [not null]\n}\n\nTable \"standings\" {\n \"id\" int4 [not null, increment]\n \"affiliation_id\" int4 [not null]\n \"standing_type\" varchar(100)\n \"sub_season_id\" int4 [not null]\n \"last_updated\" varchar(100)\n \"duration_scope\" varchar(100)\n \"competition_scope\" varchar(100)\n \"competition_scope_id\" varchar(100)\n \"alignment_scope\" varchar(100)\n \"site_scope\" varchar(100)\n \"scoping_label\" varchar(100)\n \"publisher_id\" int4 [not null]\n \"source\" varchar(100)\n}\n\nTable \"stats\" {\n \"id\" int4 [not null, increment]\n \"stat_repository_type\" varchar(100)\n \"stat_repository_id\" int4 [not null]\n \"stat_holder_type\" varchar(100)\n \"stat_holder_id\" int4\n \"stat_coverage_type\" varchar(100)\n \"stat_coverage_id\" int4\n \"context\" varchar(40) [not null]\n}\n\nTable \"sub_periods\" {\n \"id\" int4 [not null, increment]\n \"period_id\" int4 [not null]\n \"sub_period_value\" varchar(100)\n \"score\" varchar(100)\n}\n\nTable \"sub_seasons\" {\n \"id\" int4 [not null, increment]\n \"sub_season_key\" varchar(100) [not null]\n \"season_id\" int4 [not null]\n \"sub_season_type\" varchar(100) [not null]\n \"start_date_time\" timestamp\n \"end_date_time\" timestamp\n}\n\nTable \"team_american_football_stats\" {\n \"id\" int4 [not null, increment]\n \"yards_per_attempt\" varchar(100)\n \"average_starting_position\" varchar(100)\n \"timeouts\" varchar(100)\n \"time_of_possession\" varchar(100)\n \"turnover_ratio\" varchar(100)\n}\n\nTable \"team_phases\" {\n \"id\" int4 [not null, increment]\n \"team_id\" int4 [not null]\n \"start_season_id\" int4\n \"end_season_id\" int4\n \"affiliation_id\" int4 [not null]\n \"start_date_time\" 
varchar(100)\n \"end_date_time\" varchar(100)\n \"phase_status\" varchar(40)\n \"role_id\" int4\n}\n\nTable \"teams\" {\n \"id\" int4 [not null, increment]\n \"team_key\" varchar(100) [not null]\n \"publisher_id\" int4 [not null]\n \"home_site_id\" int4\n}\n\nTable \"teams_documents\" {\n \"team_id\" int4 [not null]\n \"document_id\" int4 [not null]\n}\n\nTable \"teams_media\" {\n \"team_id\" int4 [not null]\n \"media_id\" int4 [not null]\n}\n\nTable \"tennis_action_points\" {\n \"id\" int4 [not null, increment]\n \"sub_period_id\" varchar(100)\n \"sequence_number\" varchar(100)\n \"win_type\" varchar(100)\n}\n\nTable \"tennis_action_volleys\" {\n \"id\" int4 [not null, increment]\n \"sequence_number\" varchar(100)\n \"tennis_action_points_id\" int4\n \"landing_location\" varchar(100)\n \"swing_type\" varchar(100)\n \"result\" varchar(100)\n \"spin_type\" varchar(100)\n \"trajectory_details\" varchar(100)\n}\n\nTable \"tennis_event_states\" {\n \"id\" int4 [not null, increment]\n \"event_id\" int4 [not null]\n \"current_state\" int4\n \"sequence_number\" int4\n \"tennis_set\" varchar(100)\n \"game\" varchar(100)\n \"server_person_id\" int4\n \"server_score\" varchar(100)\n \"receiver_person_id\" int4\n \"receiver_score\" varchar(100)\n \"service_number\" varchar(100)\n \"context\" varchar(40)\n}\n\nTable \"tennis_return_stats\" {\n \"id\" int4 [not null, increment]\n \"returns_played\" varchar(100)\n \"matches_played\" varchar(100)\n \"first_service_return_points_won\" varchar(100)\n \"first_service_return_points_won_pct\" varchar(100)\n \"second_service_return_points_won\" varchar(100)\n \"second_service_return_points_won_pct\" varchar(100)\n \"return_games_played\" varchar(100)\n \"return_games_won\" varchar(100)\n \"return_games_won_pct\" varchar(100)\n \"break_points_played\" varchar(100)\n \"break_points_converted\" varchar(100)\n \"break_points_converted_pct\" varchar(100)\n}\n\nTable \"tennis_service_stats\" {\n \"id\" int4 [not null, increment]\n \"services_played\" varchar(100)\n \"matches_played\" varchar(100)\n \"aces\" varchar(100)\n \"first_services_good\" varchar(100)\n \"first_services_good_pct\" varchar(100)\n \"first_service_points_won\" varchar(100)\n \"first_service_points_won_pct\" varchar(100)\n \"second_service_points_won\" varchar(100)\n \"second_service_points_won_pct\" varchar(100)\n \"service_games_played\" varchar(100)\n \"service_games_won\" varchar(100)\n \"service_games_won_pct\" varchar(100)\n \"break_points_played\" varchar(100)\n \"break_points_saved\" varchar(100)\n \"break_points_saved_pct\" varchar(100)\n}\n\nTable \"wagering_moneylines\" {\n \"id\" int4 [not null, increment]\n \"bookmaker_id\" int4 [not null]\n \"event_id\" int4 [not null]\n \"date_time\" timestamp\n \"team_id\" int4 [not null]\n \"person_id\" int4\n \"rotation_key\" varchar(100)\n \"comment\" varchar(100)\n \"vigorish\" varchar(100)\n \"line\" varchar(100)\n \"line_opening\" varchar(100)\n \"prediction\" varchar(100)\n}\n\nTable \"wagering_odds_lines\" {\n \"id\" int4 [not null, increment]\n \"bookmaker_id\" int4 [not null]\n \"event_id\" int4 [not null]\n \"date_time\" timestamp\n \"team_id\" int4 [not null]\n \"person_id\" int4\n \"rotation_key\" varchar(100)\n \"comment\" varchar(100)\n \"numerator\" varchar(100)\n \"denominator\" varchar(100)\n \"prediction\" varchar(100)\n \"payout_calculation\" varchar(100)\n \"payout_amount\" varchar(100)\n}\n\nTable \"wagering_runlines\" {\n \"id\" int4 [not null, increment]\n \"bookmaker_id\" int4 [not null]\n \"event_id\" int4 [not null]\n 
\"date_time\" timestamp\n \"team_id\" int4 [not null]\n \"person_id\" int4\n \"rotation_key\" varchar(100)\n \"comment\" varchar(100)\n \"vigorish\" varchar(100)\n \"line\" varchar(100)\n \"line_opening\" varchar(100)\n \"line_value\" varchar(100)\n \"prediction\" varchar(100)\n}\n\nTable \"wagering_straight_spread_lines\" {\n \"id\" int4 [not null, increment]\n \"bookmaker_id\" int4 [not null]\n \"event_id\" int4 [not null]\n \"date_time\" timestamp\n \"team_id\" int4 [not null]\n \"person_id\" int4\n \"rotation_key\" varchar(100)\n \"comment\" varchar(100)\n \"vigorish\" varchar(100)\n \"line_value\" varchar(100)\n \"line_value_opening\" varchar(100)\n \"prediction\" varchar(100)\n}\n\nTable \"wagering_total_score_lines\" {\n \"id\" int4 [not null, increment]\n \"bookmaker_id\" int4 [not null]\n \"event_id\" int4 [not null]\n \"date_time\" timestamp\n \"team_id\" int4 [not null]\n \"person_id\" int4\n \"rotation_key\" varchar(100)\n \"comment\" varchar(100)\n \"vigorish\" varchar(100)\n \"line_over\" varchar(100)\n \"line_under\" varchar(100)\n \"total\" varchar(100)\n \"total_opening\" varchar(100)\n \"prediction\" varchar(100)\n}\n\nTable \"weather_conditions\" {\n \"id\" int4 [not null, increment]\n \"event_id\" int4 [not null]\n \"temperature\" varchar(100)\n \"temperature_units\" varchar(40)\n \"humidity\" varchar(100)\n \"clouds\" varchar(100)\n \"wind_direction\" varchar(100)\n \"wind_velocity\" varchar(100)\n \"weather_code\" varchar(100)\n}\n",
"stateUrl": null,
"stateOriginalUrl": "https://github.com/yugabyte/yugabyte-db/blob/master/sample/sportsdb_tables.sql"
}
}
================================================
FILE: tasks/postgres/easy/sports/create_performance_indexes/verify.py
================================================
"""
Verification script for PostgreSQL Sports Task 3: Query Performance Optimization
"""
import os
import sys
import psycopg2
from decimal import Decimal
def rows_match(actual_row, expected_row):
"""
Compare two rows with appropriate tolerance.
For Decimal types: allows 0.001 tolerance
For other types: requires exact match
"""
if len(actual_row) != len(expected_row):
return False
for actual, expected in zip(actual_row, expected_row):
if isinstance(actual, Decimal) and isinstance(expected, Decimal):
if abs(float(actual) - float(expected)) > 0.001:
return False
elif isinstance(actual, float) and isinstance(expected, float):
if abs(actual - expected) > 0.001:
return False
elif actual != expected:
return False
return True
def get_connection_params() -> dict:
"""Get database connection parameters."""
return {
"host": os.getenv("POSTGRES_HOST", "localhost"),
"port": int(os.getenv("POSTGRES_PORT", 5432)),
"database": os.getenv("POSTGRES_DATABASE", "sports"),
"user": os.getenv("POSTGRES_USERNAME", "postgres"),
"password": os.getenv("POSTGRES_PASSWORD", "postgres")
}
def verify_performance_optimization(conn) -> bool:
"""Verify that key performance optimization indexes have been implemented."""
with conn.cursor() as cur:
print("\n🔍 Checking for critical performance indexes...")
# Check 1: participants_events.participant_id index (critical for subqueries)
cur.execute("""
SELECT indexname, indexdef
FROM pg_indexes
WHERE schemaname = 'public'
AND tablename = 'participants_events'
AND indexdef LIKE '%participant_id%'
""")
participant_indexes = cur.fetchall()
has_participant_index = len(participant_indexes) > 0
# Check 2: stats table optimization (critical for subquery filtering)
cur.execute("""
SELECT indexname, indexdef
FROM pg_indexes
WHERE schemaname = 'public'
AND tablename = 'stats'
AND indexdef LIKE '%stat_holder_type%'
AND indexdef LIKE '%stat_holder_id%'
""")
stats_indexes = cur.fetchall()
has_stats_index = len(stats_indexes) > 0
# Report findings
critical_indexes_found = 0
if has_participant_index:
print("✅ Found participant filtering index on participants_events.participant_id")
critical_indexes_found += 1
else:
print("❌ Missing critical index on participants_events.participant_id")
if has_stats_index:
print("✅ Found subquery optimization index on stats table")
critical_indexes_found += 1
else:
print("❌ Missing critical index on stats table")
# Must have both critical indexes for this subquery-heavy query
if critical_indexes_found >= 2:
print(f"\n✅ Performance optimization: PASS ({critical_indexes_found}/2 critical indexes found)")
return True
else:
print(f"\n❌ Performance optimization: FAIL ({critical_indexes_found}/2 critical indexes found)")
print(" Create these critical indexes:")
print(" - CREATE INDEX ON participants_events(participant_id);")
print(" - CREATE INDEX ON stats(stat_holder_type, stat_holder_id);")
return False
def main():
"""Main verification function."""
print("=" * 50)
print("Verifying Sports Task 3: Query Performance Optimization")
print("=" * 50)
# Get connection parameters
conn_params = get_connection_params()
if not conn_params["database"]:
print("❌ No database specified")
sys.exit(1)
try:
# Connect to database
conn = psycopg2.connect(**conn_params)
# Verify all components
success = verify_performance_optimization(conn)
conn.close()
if success:
print("\n🎉 Task verification: PASS")
sys.exit(0)
else:
print("\n❌ Task verification: FAIL")
sys.exit(1)
except psycopg2.Error as e:
print(f"❌ Database error: {e}")
sys.exit(1)
except Exception as e:
print(f"❌ Verification error: {e}")
sys.exit(1)
if __name__ == "__main__":
main()
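# --- Optional illustration (not part of the original verifier) ---------------
# Hedged sketch: one way to create the two indexes this script checks for,
# using psycopg2. The statements mirror the hints printed above; the index
# names are illustrative placeholders (the checks match on indexdef, not on
# name), and this helper is never called by main().
def create_recommended_indexes(conn):
    """Create the indexes verify_performance_optimization() looks for."""
    with conn.cursor() as cur:
        cur.execute(
            "CREATE INDEX IF NOT EXISTS idx_participants_events_participant_id "
            "ON participants_events(participant_id)"
        )
        cur.execute(
            "CREATE INDEX IF NOT EXISTS idx_stats_holder "
            "ON stats(stat_holder_type, stat_holder_id)"
        )
    conn.commit()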
================================================
FILE: tasks/postgres/standard/chinook/customer_data_migration/description.md
================================================
Migrate customer data from an acquired company to PostgreSQL using efficient bulk operations.
## Your Mission:
Chinook Music Store has recently acquired "MelodyMart," a competing music retailer. Their customer database needs to be migrated into Chinook's PostgreSQL database.
## Migration Requirements:
1. **Process all customer records from the data table below** and migrate them into the `Customer` table
2. **Apply business logic during migration**:
- Assign `CustomerId` values starting from the next available ID
- Assign all customers to support representative with EmployeeId 3
- Set `Fax` field to NULL for all migrated customers
3. **Avoid individual INSERT statements** (one possible bulk approach is sketched below)
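One possible shape for the bulk load is sketched below. This is a hedged illustration, not the reference solution: it assumes a psycopg2 connection (the connection parameters are placeholders) and that the customer rows have been parsed from the table that follows.
```python
# Hedged sketch only: `rows` is assumed to hold the customer tuples parsed from
# the data table below, in (FirstName, ..., Email) column order.
import psycopg2
from psycopg2.extras import execute_values

rows = [
    ("Danielle", "Johnson", "Sanchez-Taylor", "819 Johnson Course",
     "East William", "AK", "USA", "74064", "386-3794",
     "danielle.johnson@sancheztaylor.com"),
    # ... remaining rows from the table below ...
]

conn = psycopg2.connect(dbname="chinook")  # placeholder connection parameters
with conn, conn.cursor() as cur:
    # Continue CustomerId from the current maximum, then issue a single
    # multi-row INSERT; SupportRepId is fixed to 3 and Fax stays NULL.
    cur.execute('SELECT COALESCE(MAX("CustomerId"), 0) FROM "Customer"')
    next_id = cur.fetchone()[0] + 1
    execute_values(
        cur,
        'INSERT INTO "Customer" ("CustomerId", "FirstName", "LastName", "Company", '
        '"Address", "City", "State", "Country", "PostalCode", "Phone", "Fax", '
        '"Email", "SupportRepId") VALUES %s',
        [(next_id + i, *r[:9], None, r[9], 3) for i, r in enumerate(rows)],
    )
conn.close()
```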
## Customer Data to Migrate:
| FirstName | LastName | Company | Address | City | State | Country | PostalCode | Phone | Email |
|-----------|----------|---------|---------|------|-------|---------|------------|-------|--------|
| Danielle | Johnson | Sanchez-Taylor | 819 Johnson Course | East William | AK | USA | 74064 | 386-3794 | danielle.johnson@sancheztaylor.com |
| Katherine | Moore | Peterson-Moore | 16155 Roman Stream Suite 816 | New Kellystad | OK | USA | 25704 | 103-4131 | katherine_moore@petersonmoore.org |
| Joshua | Reid | Martin-Kelly | 192 Frank Light Suite 835 | East Lydiamouth | MO | USA | 35594 | 139-5376 | joshua_reid@martinkelly.org |
| Douglas | Taylor | Hoffman, Baker and Richards | 3287 Katelyn Wall Apt. 226 | South Patrickmouth | NC | USA | 33454 | 801-8451 | douglast@hoffmanbakerand.net |
| Ryan | Chavez | Liu, Baker and Mason | 148 Eric Track | New Stephanie | NC | USA | 00575 | 957-0154 | r.chavez@liubakerandmaso.com |
| Brian | Humphrey | Miller Group | 227 Joseph Well | Brandtside | WV | USA | 96174 | 346-5787 | brian.humphrey@millergroup.com |
| John | Brown | Chapman and Sons | 10310 Jones Freeway | Elizabethborough | ND | USA | 17843 | 997-3763 | john.brown@chapmanandsons.com |
| Collin | Jordan | Jenkins-Shields | 106 Mcbride Coves | East James | NV | USA | 18874 | 624-7317 | collin.jordan@jenkinsshields.com |
| Brent | Kidd | Novak and Sons | 7736 Franklin Alley | Bakermouth | LA | USA | 55945 | 872-3430 | brent.kidd@novakandsons.com |
| Julie | Brown | Woods, Calhoun and Schmidt | 121 Emma Freeway | Wilsonshire | IA | USA | 76381 | 909-1699 | julieb@woodscalhounand.net |
| Sarah | Harris | Edwards, Baker and Anderson | 5107 Charles Forest Suite 251 | West Justin | NV | USA | 71701 | 498-0841 | s.harris@edwardsbakerand.com |
| Joseph | Preston | Tran, Nelson and Jacobs | 48740 Cynthia Village Suite 005 | Lake Tina | GA | USA | 97655 | 786-8011 | j.preston@trannelsonandja.com |
| Amy | Davenport | Tran, Jordan and Williams | 53315 Dickson Summit Apt. 322 | Johnsonmouth | WY | USA | 54465 | 342-1607 | a.davenport@tranjordanandwi.com |
| James | Sellers | Torres-Pope | 03654 Tammy Harbors | Darlenefurt | TX | USA | 70783 | 501-4294 | james.sellers@torrespope.com |
| Daniel | Hamilton | Hartman, Graham and Joyce | 9340 Smith Valley | West Ryan | TN | USA | 43780 | 951-4846 | danielh@hartmangrahaman.net |
| Richard | Phillips | Lee Ltd | 299 Sullivan Village Apt. 443 | Floydmouth | NH | USA | 58406 | 738-7214 | richardp@leeltd.net |
| Clarence | Crane | Chambers and Sons | 00379 Stanley Roads | Lake Heather | NM | USA | 52884 | 320-1632 | clarence_crane@chambersandsons.org |
| Brent | Wright | Bryant Group | 9868 Merritt Summit Suite 743 | Katiehaven | NM | USA | 82650 | 347-1434 | brentw@bryantgroup.net |
| Luis | Fernandez | Hernandez Group | 316 Rivera Mountain | Brownchester | MS | USA | 77057 | 096-7054 | luis_fernandez@hernandezgroup.org |
| Melissa | Ashley | Medina-Navarro | 3467 Paul Skyway | Ramseymouth | PW | USA | 17229 | 980-6990 | melissa.ashley@medinanavarro.com |
| Dawn | Taylor | White-Green | 75564 King Common Suite 080 | Jeffreyland | WI | USA | 85927 | 003-3092 | d.taylor@whitegreen.com |
| David | Caldwell | Gould, Marshall and Scott | 99124 Beth Inlet Suite 631 | North Heidi | ME | USA | 90188 | 919-0586 | davidc@gouldmarshallan.net |
| Casey | Holland | Atkinson Group | 5726 Jessica Run | Christinaside | WI | USA | 63873 | 769-4531 | caseyh@atkinsongroup.net |
| Nicole | Sanchez | Hudson-Barnett | 75273 Salinas Junctions Suite 948 | New Stacyland | IA | USA | 94882 | 678-3777 | nicole.sanchez@hudsonbarnett.com |
| Christopher | Walker | Sanchez, Beck and Wood | 8557 Parker Fort Apt. 351 | East Javier | NJ | USA | 36742 | 989-4134 | c.walker@sanchezbeckandw.com |
| Michael | Turner | Ferguson, Hill and Mccann | 271 Audrey Mountains Suite 752 | West Shelleyfort | DE | USA | 09065 | 671-9022 | michaelt@fergusonhilland.net |
| Christopher | Wright | Duran, Obrien and Gibbs | 677 Dalton Meadow | Ashleyton | RI | USA | 97505 | 133-4123 | c.wright@duranobrienandg.com |
| Andrea | Moore | Hayes-Wheeler | 34471 Sandra Turnpike Apt. 618 | Lake Edward | KY | USA | 19144 | 102-4994 | andrea_moore@hayeswheeler.org |
| David | Barker | Powell, Nelson and Fernandez | 90659 Johnson Forks Apt. 490 | South April | NV | USA | 36959 | 296-7175 | david_barker@powellnelsonand.org |
| Mathew | Santiago | Rivera Ltd | 6807 Leonard Islands Apt. 680 | Gutierrezborough | NC | USA | 47920 | 977-0348 | m.santiago@riveraltd.com |
| Sara | Kim | Washington, Johnson and Mccoy | 248 Andrea Course | Port Robin | NH | USA | 15897 | 274-8467 | sara_kim@washingtonjohns.org |
| John | Arnold | Lee-Greene | 46584 Justin Hills | Grimesmouth | ND | USA | 63984 | 558-8675 | j.arnold@leegreene.com |
| Tina | Allen | Hall-Rowe | 7662 Hanna Crossroad | Mollymouth | CT | USA | 69438 | 702-6217 | tinaa@hallrowe.net |
| Matthew | Schwartz | Miller, Murphy and Craig | 7809 Jimmy Spur Suite 316 | Port Cynthiaville | NV | USA | 22306 | 400-5045 | matthews@millermurphyand.net |
| Ryan | Sanchez | Knight-Sparks | 19693 Durham Divide | South Dana | NH | USA | 33967 | 074-8217 | ryans@knightsparks.net |
| Vanessa | Evans | Vaughn-Bryant | 67136 Andrews Squares Suite 064 | New Michelleton | PW | USA | 79983 | 743-9533 | vanessae@vaughnbryant.net |
| Erica | Le | Becker, Taylor and Davis | 7095 Christopher Hill | Julieburgh | ID | USA | 17823 | 858-8424 | erica_le@beckertaylorand.org |
| Tammy | Phillips | Brock-Mcdonald | 36851 Smith Plain | South Miguelview | OR | USA | 50442 | 513-7098 | tammyp@brockmcdonald.net |
| Rose | Walker | Reid Group | 612 Sophia Hollow Suite 113 | South Shawn | TN | USA | 97905 | 869-2617 | rose_walker@reidgroup.org |
| Sheila | Ramirez | Wood, Ramos and Sampson | 58506 Lopez Crossing Suite 139 | North Kristinbury | DC | USA | 74501 | 318-3933 | sheilar@woodramosandsam.net |
| Kim | Kramer | Smith, Garrison and Thomas | 421 David Knolls | New Mario | HI | USA | 35283 | 026-8117 | kim_kramer@smithgarrisonan.org |
| Kimberly | Palmer | Hayes and Sons | 847 Bruce Neck | Simmonsville | NM | USA | 93876 | 711-5921 | k.palmer@hayesandsons.com |
| Joshua | Schultz | Joseph, James and Harper | 8961 Melissa Run Apt. 673 | Morganmouth | MO | USA | 55025 | 156-5452 | joshua_schultz@josephjamesandh.org |
| Carlos | Decker | Reynolds Ltd | 80988 Santiago Loop Suite 604 | Michaelshire | NY | USA | 28385 | 273-1585 | carlos.decker@reynoldsltd.com |
| Kathryn | Andrews | Bruce-Villegas | 402 Park Inlet | Michaelburgh | VI | USA | 19277 | 961-2018 | k.andrews@brucevillegas.com |
| Nicholas | Chavez | Wood Ltd | 910 Eric River Apt. 147 | Tuckermouth | MT | USA | 36305 | 381-5614 | nicholas_chavez@woodltd.org |
| Alison | Parker | Foster PLC | 34324 Murphy Avenue | Burgessburgh | DC | USA | 50335 | 838-8516 | alison.parker@fosterplc.com |
| Ryan | Stevens | Atkins PLC | 664 Richard Islands Apt. 975 | South Meganbury | NE | USA | 77685 | 681-6453 | ryans@atkinsplc.net |
| Kimberly | Jones | Wilson, Hicks and Bullock | 2312 Gonzalez Rapids Apt. 127 | Webstershire | NV | USA | 89778 | 995-5271 | kimberly_jones@wilsonhicksandb.org |
| Scott | Turner | Vargas-Bell | 7700 Decker Club | New Brookefurt | NH | USA | 76565 | 807-9359 | scott_turner@vargasbell.org |
| Walter | Rosario | Garcia-Nolan | 182 John Mill Suite 889 | West Nathan | LA | USA | 51280 | 659-0515 | walter.rosario@garcianolan.com |
| Angela | Hughes | Cummings-Douglas | 1925 Ponce Square | Andersonland | ME | USA | 73760 | 652-8168 | angelah@cummingsdouglas.net |
| Andrew | Parker | Peterson Group | 22141 Ebony Wells | New Nicholas | GA | USA | 24204 | 927-0653 | andrew_parker@petersongroup.org |
| Cheryl | Goodwin | Young-Allen | 59774 Shaw Manor Apt. 392 | Brettfort | VI | USA | 49156 | 818-1412 | cherylg@youngallen.net |
| Shannon | Palmer | Davis-Lozano | 0606 Young Common Suite 305 | Port Jennifermouth | WY | USA | 19643 | 204-7277 | shannon.palmer@davislozano.com |
| Rebecca | Smith | Conley PLC | 43410 Robert Underpass Suite 117 | Lake Zacharybury | VT | USA | 19319 | 460-9539 | rebecca_smith@conleyplc.org |
| Jacob | Barnett | Villegas, Jones and Fox | 7065 Burgess Knolls | West Johnville | WI | USA | 76772 | 520-5852 | jacob_barnett@villegasjonesan.org |
| Tina | Mendoza | Cain Inc | 43030 Mahoney Passage Suite 874 | Port Deborahport | MI | USA | 06766 | 541-5667 | tina_mendoza@caininc.org |
| Matthew | Lopez | Jimenez, Glass and Stone | 616 Amy Islands | North Markport | ME | USA | 58948 | 962-7570 | matthewl@jimenezglassand.net |
| Christina | Graham | Whitney, Gould and Jones | 8202 Johnson Cliff Apt. 556 | New Ericmouth | MN | USA | 49261 | 719-2856 | christinag@whitneygouldand.net |
| Debra | Wright | Johnson and Sons | 681 Hampton Squares Suite 394 | Gonzalezberg | PR | USA | 10207 | 727-1551 | debraw@johnsonandsons.net |
| Patricia | York | Mckinney, Graves and Thompson | 313 Joel Park Apt. 589 | Tannerside | DC | USA | 80710 | 114-6786 | patricia_york@mckinneygravesa.org |
| Madeline | Jones | Day-Cole | 89226 Marie Path Apt. 422 | Sarahbury | MI | USA | 68513 | 414-3842 | madelinej@daycole.net |
| Christina | Davis | Jackson, David and Moore | 001 Stacy Trail Suite 396 | South Pamelaside | LA | USA | 84637 | 473-6471 | christina.davis@jacksondavidand.com |
| Eric | Perry | Harris-Lawson | 556 Kathleen Passage Apt. 537 | West Shannonberg | CT | USA | 07133 | 469-6325 | ericp@harrislawson.net |
| James | Moore | Owens, Koch and Jimenez | 8733 Williams Haven | Harperfort | LA | USA | 70846 | 016-2456 | jamesm@owenskochandjim.net |
| Brandon | Williams | Lee, Tran and Jones | 499 David Court Suite 558 | Kariborough | PA | USA | 67232 | 680-0025 | brandon_williams@leetranandjones.org |
| April | Hernandez | Taylor, Velazquez and Flores | 495 Erickson Hills Suite 055 | South Brandytown | PA | USA | 62706 | 499-3097 | a.hernandez@taylorvelazquez.com |
| Alexandria | Griffith | Hernandez-Becker | 130 Edwards Drive | Vaughnchester | NY | USA | 80648 | 702-8385 | alexandria_griffith@hernandezbecker.org |
| Alicia | Edwards | Stevens PLC | 549 Lee Gateway Suite 843 | Kellieborough | UT | USA | 92905 | 757-5844 | alicia.edwards@stevensplc.com |
| Ashley | Daniels | Cardenas-Blevins | 0415 Douglas Summit | Lewisside | KY | USA | 74165 | 421-9933 | ashley.daniels@cardenasblevins.com |
| Elizabeth | Schmidt | Hall, Garcia and Rivera | 20826 Woods Flats Suite 540 | Lake Audreyside | WA | USA | 95281 | 026-2067 | e.schmidt@hallgarciaandri.com |
| Sharon | Hayden | Mcdowell-Smith | 4788 Small Dale | Nelsonville | MA | USA | 21799 | 742-0549 | s.hayden@mcdowellsmith.com |
| Gregory | Chase | Wilcox-Robertson | 1227 Boyle Avenue | Patrickmouth | WV | USA | 35496 | 549-9045 | g.chase@wilcoxrobertson.com |
| Bryan | Wilson | Moore-Parks | 145 Jeffrey Dale Suite 279 | Robertside | PW | USA | 62213 | 833-9187 | bryanw@mooreparks.net |
| Christian | Elliott | Poole PLC | 822 Bond Mills | Lake Jamieshire | NM | USA | 12420 | 870-7286 | christian_elliott@pooleplc.org |
| Anne | Hansen | Roman, Cummings and Foster | 391 Rodney Squares | New Virginialand | NJ | USA | 04660 | 462-2656 | anne_hansen@romancummingsan.org |
| Molly | Knox | Miller-Brandt | 512 Rice Stream | Port Adam | AK | USA | 39608 | 786-8633 | molly_knox@millerbrandt.org |
| Michael | Hill | Cannon, Johnson and Keller | 31190 Harper Squares | East Joyfurt | NV | USA | 31216 | 830-2843 | michaelh@cannonjohnsonan.net |
| Barbara | Barton | Young-Walter | 4408 Connie Meadow | Williamsstad | SD | USA | 88495 | 685-6624 | barbara_barton@youngwalter.org |
| Ivan | Medina | Atkinson LLC | 0866 Paul Glens | West Deborah | NV | USA | 49138 | 183-0469 | ivan.medina@atkinsonllc.com |
| Morgan | Lopez | Ramsey, Hansen and Mendoza | 0331 Rocha Square Apt. 638 | Kimberlyfurt | NH | USA | 70447 | 544-5877 | morgan.lopez@ramseyhansenand.com |
| Leah | Bowen | Rocha-Wood | 93204 Phillips Flat Suite 369 | South Andrea | TX | USA | 44746 | 477-7252 | l.bowen@rochawood.com |
| Jennifer | Freeman | Mooney, Bernard and Warren | 006 Megan Fort | Lake Edwardborough | NY | USA | 60271 | 509-9770 | jennifer.freeman@mooneybernardan.com |
| Amanda | Jenkins | Moreno LLC | 86211 John River Suite 546 | West Susanmouth | OK | USA | 32378 | 341-0166 | amanda_jenkins@morenollc.org |
| Angela | Brown | Warner Inc | 5918 Jerry Ways Suite 401 | Rachelshire | TN | USA | 04813 | 250-3926 | angela.brown@warnerinc.com |
| Kevin | Elliott | Davenport, Price and Mosley | 2185 Connor Fort Apt. 599 | Novakmouth | AK | USA | 83616 | 477-3586 | kevin_elliott@davenportpricea.org |
| Jacob | Willis | Miller-Montgomery | 114 Norman Tunnel | Lake Peter | MN | USA | 14466 | 104-7541 | j.willis@millermontgomer.com |
| Christopher | Jordan | Peters, Russell and Johnson | 199 Shields Bridge Suite 661 | New Adriana | TX | USA | 50404 | 224-4472 | christopher.jordan@petersrussellan.com |
| Gary | Hill | Washington-Jones | 79937 Derek Avenue Suite 596 | Scottchester | GU | USA | 85833 | 924-5937 | garyh@washingtonjones.net |
| Gregory | Sanders | Carter-Neal | 356 Velasquez Lock Suite 193 | Lake Katrina | AK | USA | 95818 | 737-4167 | g.sanders@carterneal.com |
| Cynthia | Allen | Moore, Henderson and Bennett | 796 Stephens Turnpike Suite 891 | Port Johnstad | GA | USA | 85304 | 909-6561 | cynthia.allen@moorehendersona.com |
| Corey | Walker | Stone, Carpenter and Johnston | 6798 Michael Burg Suite 146 | North Marieberg | MI | USA | 41381 | 573-8757 | corey.walker@stonecarpentera.com |
| Samuel | Horton | Jones-Williams | 51238 Andrea Isle | Mullenbury | AS | USA | 53591 | 226-6093 | samuel_horton@joneswilliams.org |
| Brittany | Price | Lewis, Ramirez and Padilla | 182 Nguyen Mount | West Emilyfort | NC | USA | 84270 | 596-9691 | brittanyp@lewisramirezand.net |
| Michael | Ellis | Cervantes Ltd | 912 Wilson Inlet Apt. 252 | Barnesberg | OK | USA | 50794 | 627-8282 | michael_ellis@cervantesltd.org |
| Keith | Lopez | Harvey-Glenn | 2368 Ortiz Overpass | Mckinneymouth | NM | USA | 22423 | 190-3404 | k.lopez@harveyglenn.com |
| Amanda | Jackson | Cunningham-Barton | 819 Joseph Plains Suite 807 | South Curtis | MP | USA | 86179 | 340-7451 | amanda_jackson@cunninghambarto.org |
| Michelle | Wilson | Clark Ltd | 962 Kristen Via Apt. 095 | Candiceburgh | MD | USA | 92782 | 449-4812 | michelle_wilson@clarkltd.org |
| Samantha | Riddle | Martinez, Cline and Wright | 67294 Brooks Club Apt. 684 | Shawnfort | MD | USA | 76779 | 017-5186 | s.riddle@martinezclinean.com |
| Tammy | Summers | Adams-Clayton | 929 Kramer Springs Apt. 017 | North Sarahburgh | NV | USA | 60337 | 063-2424 | tammy.summers@adamsclayton.com |
| Diamond | Wright | Beck-Banks | 4361 Aaron Neck | East Brittneyhaven | TX | USA | 58836 | 005-1627 | diamond.wright@beckbanks.com |
| Jeremy | Davis | Garcia LLC | 62218 Chelsey Expressway Suite 532 | Jensenmouth | VI | USA | 28975 | 112-1965 | jeremy_davis@garciallc.org |
| Leonard | Taylor | Newman-Wright | 043 Julie Hill Apt. 376 | East Victorland | NC | USA | 02082 | 552-6965 | l.taylor@newmanwright.com |
| Kathryn | Best | Smith Inc | 3006 Fuller Parkway | Hendersonfurt | CO | USA | 84457 | 889-2414 | kathryn.best@smithinc.com |
| William | Harris | Herrera Group | 6303 Sandy Crescent | Salazarton | ME | USA | 87805 | 210-2027 | williamh@herreragroup.net |
| Alexandra | Logan | Green, Watson and Brady | 105 Nelson Circles Suite 917 | Dixonton | NM | USA | 74803 | 252-4191 | a.logan@greenwatsonandb.com |
| Joyce | Smith | Sanchez Group | 2208 Walker Gateway Suite 541 | Davidton | HI | USA | 29754 | 806-1744 | joyces@sanchezgroup.net |
| Christopher | Bryant | Gonzalez-Elliott | 937 Vargas Park Apt. 832 | South Andrewside | MI | USA | 83855 | 050-6413 | c.bryant@gonzalezelliott.com |
| Robert | Woodward | Dawson Inc | 86571 William Route | Jonesshire | AR | USA | 57515 | 234-4565 | robertw@dawsoninc.net |
| Shawn | Hall | Taylor PLC | 12775 Martinez Knolls | South Kyle | KS | USA | 16218 | 124-9035 | s.hall@taylorplc.com |
| Christopher | Wright | Foster-Williams | 2067 Cody Cove Apt. 100 | East James | MO | USA | 49291 | 199-4101 | c.wright@fosterwilliams.com |
| Rachel | Ramos | Davis LLC | 70296 Crawford Light | Thompsonborough | PW | USA | 25031 | 447-2099 | r.ramos@davisllc.com |
| Deborah | Porter | Mendoza, Miller and Reyes | 83806 Castillo Tunnel Suite 598 | Paulburgh | AK | USA | 42296 | 930-4078 | deborahp@mendozamilleran.net |
| Katie | Key | Garcia Ltd | 8039 Kelly Villages | East Joel | MD | USA | 97245 | 590-5992 | k.key@garcialtd.com |
| Mary | Cochran | Weaver-Thompson | 03930 Smith Ridges | Port David | VT | USA | 23761 | 500-2921 | maryc@weaverthompson.net |
| Susan | Brooks | Foster, Garcia and Turner | 67528 Walker Radial | South Kurt | UT | USA | 39103 | 220-9690 | s.brooks@fostergarciaand.com |
| Carrie | Mccall | Walker, Cunningham and Zuniga | 1355 Daisy Corners | Seanview | IL | USA | 33208 | 154-1006 | carrie_mccall@walkercunningha.org |
| Jessica | Costa | Snyder-Gray | 79327 Lauren Bypass Suite 054 | North Matthewfurt | GA | USA | 96443 | 181-5997 | jessica.costa@snydergray.com |
| Ryan | Valdez | Preston, Moore and Garcia | 68844 Young Causeway | Armstrongfort | FL | USA | 07645 | 506-1497 | r.valdez@prestonmooreand.com |
| Collin | Clark | Carter, Miller and Anthony | 7741 Lopez Light Suite 270 | Scottview | IN | USA | 35701 | 902-1158 | collin_clark@cartermillerand.org |
| Tara | Lawrence | Brown, Hughes and Mills | 374 Ralph Walk Apt. 898 | North Stacy | NV | USA | 23160 | 233-2061 | tara_lawrence@brownhughesandm.org |
| James | Carson | Flowers LLC | 116 Arnold Walks Suite 870 | Rodriguezberg | FL | USA | 74765 | 991-1914 | jamesc@flowersllc.net |
| Natalie | Baker | Washington, Lynch and Johnson | 2996 Randy Isle Apt. 074 | Andrewport | ME | USA | 37246 | 713-2475 | natalieb@washingtonlynch.net |
| Jessica | Jacobs | Lopez and Sons | 785 Zachary Estate Apt. 486 | Port Melissa | FM | USA | 75038 | 023-3030 | jessica_jacobs@lopezandsons.org |
| Brent | Ward | Hill Group | 103 Burns Mission Apt. 798 | Maxview | WA | USA | 90790 | 140-6029 | b.ward@hillgroup.com |
| Mercedes | Holland | Clark, Pearson and Palmer | 2290 Johnny Valley | Jenniferview | NE | USA | 49846 | 574-3748 | mercedes_holland@clarkpearsonand.org |
| Breanna | Smith | Levy, Franco and Hoffman | 1715 Davidson Wall Suite 443 | New Kathy | MH | USA | 07942 | 965-2074 | breannas@levyfrancoandho.net |
| Rebecca | Sullivan | Johnson, Erickson and Armstrong | 3875 Bruce Ville | West Connor | DC | USA | 97614 | 482-5135 | r.sullivan@johnsonerickson.com |
| Julie | Parker | Watson-Richards | 70999 Thomas Fields Apt. 684 | Brownberg | DC | USA | 26754 | 569-7252 | julie.parker@watsonrichards.com |
| Tony | Welch | Edwards Inc | 4329 Tracy Track | East Christinachester | MO | USA | 56734 | 760-0835 | tony.welch@edwardsinc.com |
| Patricia | Sherman | Lee, Rhodes and Sims | 54216 Jackson View | West Stacymouth | VA | USA | 68696 | 985-6257 | patricias@leerhodesandsim.net |
| Karen | Martin | Smith-Walker | 09821 Dawson Turnpike | South Nancyview | WI | USA | 70589 | 909-0100 | karen.martin@smithwalker.com |
| Robert | James | King, Miles and Harris | 6184 Robert Cove | West Danielville | NM | USA | 26538 | 934-8356 | robertj@kingmilesandhar.net |
| Ethan | Kelley | Watts Group | 00119 Hernandez Course Apt. 143 | Hintonport | KS | USA | 61354 | 012-0455 | ethan_kelley@wattsgroup.org |
| Joanna | Davis | Smith and Sons | 5794 Nathan Junctions | North Richard | NH | USA | 36130 | 676-2120 | j.davis@smithandsons.com |
| Dale | Pruitt | Pham-Gregory | 659 Michelle Villages | South Samantha | DE | USA | 54408 | 701-4508 | d.pruitt@phamgregory.com |
| Tiffany | Santiago | Stone-Watts | 3756 Mary Point | North Dawnburgh | NY | USA | 62011 | 721-7535 | tiffanys@stonewatts.net |
| Brent | Walker | Gray, Montoya and Miller | 717 Stewart Parks Apt. 166 | New Andrealand | WY | USA | 79695 | 948-8375 | brentw@graymontoyaandm.net |
| Marcia | Velasquez | Rivera-Saunders | 571 Katherine Forges Apt. 554 | Jacquelineton | MH | USA | 22017 | 726-1493 | m.velasquez@riverasaunders.com |
| David | Phelps | Bryant and Sons | 60917 Barrett Parkways Apt. 708 | New Savannahshire | NJ | USA | 67129 | 292-2169 | davidp@bryantandsons.net |
| William | Cruz | Moon, Farmer and Hill | 7226 Cameron Plaza Suite 833 | New Jennifer | TX | USA | 45759 | 228-8515 | william_cruz@moonfarmerandhi.org |
| Brandi | Bender | Butler, Adkins and Skinner | 0810 Thomas Skyway Apt. 342 | Francesberg | MP | USA | 08631 | 438-0571 | b.bender@butleradkinsand.com |
| Julia | Hoffman | Dixon Ltd | 066 Frye Spur Suite 800 | Jamesmouth | MP | USA | 30064 | 598-9334 | julia_hoffman@dixonltd.org |
| Gregory | Fleming | Rivers Ltd | 0648 Anderson Prairie | Adammouth | VT | USA | 20791 | 025-9094 | gregory_fleming@riversltd.org |
| Kristy | Pierce | Bowers LLC | 81826 Davis Forges | Lake Martin | MN | USA | 38980 | 398-7801 | kristyp@bowersllc.net |
| Sean | Conway | Sellers, Sanchez and Williams | 1648 Johnson Path Suite 887 | Williamsborough | MD | USA | 67858 | 112-8801 | s.conway@sellerssancheza.com |
| Ellen | Ayala | Coleman, Garcia and Medina | 120 Love Camp Apt. 102 | Angelashire | GU | USA | 30338 | 466-7665 | ellen.ayala@colemangarciaan.com |
| Perry | Wilson | May PLC | 901 Reilly Coves | Kristinport | PA | USA | 11839 | 476-6072 | p.wilson@mayplc.com |
| Derek | Myers | Phillips, Walters and Evans | 88210 Ashley Lock Apt. 435 | South Rebecca | PR | USA | 67682 | 222-3943 | derek.myers@phillipswalters.com |
| Howard | Marsh | York PLC | 814 John Flat Suite 552 | North Justin | CA | USA | 25863 | 577-5949 | h.marsh@yorkplc.com |
| Ariana | Diaz | Benjamin-Jackson | 36452 Humphrey Mountain Suite 547 | East Debbieland | MP | USA | 37281 | 283-4110 | ariana.diaz@benjaminjackson.com |
| Lisa | Riley | Lewis, Johnson and Green | 256 Patricia Radial Suite 278 | South Michaeltown | TN | USA | 31811 | 928-2722 | l.riley@lewisjohnsonand.com |
| Jill | Webb | Williams-Juarez | 45303 Hughes Motorway | North Tinamouth | CT | USA | 92741 | 844-9892 | jill_webb@williamsjuarez.org |
| Desiree | Diaz | Villanueva, Miller and King | 655 Sparks Rapids | New Nicolemouth | GA | USA | 30646 | 184-3222 | desireed@villanuevamille.net |
| Carolyn | Montoya | Hall, Shepherd and Cortez | 773 Deborah Loop Apt. 302 | East Crystal | AZ | USA | 75509 | 202-4286 | carolyn.montoya@hallshepherdand.com |
| Natalie | Luna | Valentine-Robinson | 2369 Laura View Apt. 984 | Lake Gina | NH | USA | 78689 | 913-6621 | natalie.luna@valentinerobins.com |
| James | Heath | Cohen, Serrano and Jacobs | 9908 Christopher Shoals | New Amber | AL | USA | 89441 | 686-5086 | j.heath@cohenserranoand.com |
| Shawna | Olson | Bell-Ballard | 2473 Justin Wells | Scotttown | VT | USA | 97972 | 098-1806 | s.olson@bellballard.com |
| Gwendolyn | Stewart | Rodriguez-Simmons | 8695 Braun Locks Apt. 688 | Whiteside | OH | USA | 63908 | 449-5621 | g.stewart@rodriguezsimmon.com |
| Sean | Lyons | Garcia PLC | 8902 Oconnell Avenue Apt. 279 | Davisview | IN | USA | 49107 | 190-6698 | seanl@garciaplc.net |
| Jennifer | Harper | Bowman Group | 84309 Christina Spring | West Johntown | GA | USA | 11883 | 465-6693 | jennifer.harper@bowmangroup.com |
| Jillian | Jones | Dunn Ltd | 4393 Spears Ports Apt. 426 | New Charlesport | MA | USA | 15837 | 848-9476 | jillian_jones@dunnltd.org |
| Kayla | Todd | Maldonado-Mosley | 1416 Erica Forks | Robertstad | NC | USA | 70709 | 043-4165 | kayla.todd@maldonadomosley.com |
| Angela | White | Gomez-Shannon | 37333 Clark Flats Apt. 952 | North Samanthafort | RI | USA | 01369 | 807-5957 | angelaw@gomezshannon.net |
| Travis | Joyce | Ramirez, Walker and Ray | 678 Wayne Lock | South Tiffany | UT | USA | 68423 | 750-0369 | travis.joyce@ramirezwalkeran.com |
| Mark | Salazar | Lopez-Baker | 9552 Coleman Manor Suite 564 | Whiteberg | OK | USA | 90417 | 314-3866 | m.salazar@lopezbaker.com |
| Dustin | Haley | Kennedy Inc | 7288 Floyd Hills | Annashire | AR | USA | 52720 | 120-3471 | dustin_haley@kennedyinc.org |
| Julie | Green | Castro-Frederick | 0615 Barbara Run Apt. 455 | Hamptonmouth | FM | USA | 10778 | 694-7225 | julie_green@castrofrederick.org |
| Crystal | Duncan | Miller LLC | 5449 Nelson Mills | Juliehaven | NV | USA | 54763 | 220-2341 | c.duncan@millerllc.com |
| Garrett | Garcia | Zuniga Group | 68114 Christopher Loaf | Jeromeport | NV | USA | 82615 | 228-2005 | garrettg@zunigagroup.net |
| Michelle | Mcdonald | Donovan, Dunn and Taylor | 979 Mills Route | Reginafort | ND | USA | 30271 | 174-5642 | michellem@donovandunnandt.net |
| Alex | Mills | Cooper Group | 774 Katie Union | Carlatown | OH | USA | 49475 | 368-6632 | alex_mills@coopergroup.org |
| Maria | Walker | Henderson and Sons | 8463 Ian Highway Apt. 797 | Jackiefort | ID | USA | 42528 | 020-8021 | mariaw@hendersonandson.net |
| Joseph | Espinoza | Smith, Davis and Smith | 6475 Terry Bypass | Christopherberg | AR | USA | 35432 | 618-7234 | joseph_espinoza@smithdavisandsm.org |
| Maria | Martinez | Wright, Wise and Ramos | 71837 Maldonado Inlet | Ericton | WA | USA | 72535 | 814-7435 | maria.martinez@wrightwiseandra.com |
| Michelle | Robinson | Young Group | 24916 Albert Canyon Suite 925 | East Ericland | TX | USA | 81588 | 500-5281 | m.robinson@younggroup.com |
| Tony | Stewart | Kramer, Sherman and Trujillo | 306 Ramsey Glen Apt. 778 | Amyfort | ID | USA | 74779 | 285-5749 | t.stewart@kramershermanan.com |
| Casey | Moore | Weiss-Weaver | 86209 Parsons Garden Suite 186 | New Felicia | WI | USA | 72782 | 294-5651 | casey.moore@weissweaver.com |
| Alexandra | Jones | White Inc | 73109 Barrett Pine | Brandonbury | PA | USA | 94590 | 103-7170 | alexandraj@whiteinc.net |
| Angela | Hurley | Short-Bauer | 480 Mary Club | New Colton | VA | USA | 30780 | 863-3839 | a.hurley@shortbauer.com |
| Angela | Grant | Garcia, Fowler and Howard | 612 Andrea Parkways Suite 289 | Mahoneymouth | OH | USA | 43054 | 566-5939 | a.grant@garciafowlerand.com |
| Nicholas | Pierce | King, Nixon and West | 04908 Victoria Hollow Apt. 433 | Andrewview | PW | USA | 73070 | 889-9210 | nicholas_pierce@kingnixonandwes.org |
| Michael | Taylor | Preston-Wright | 1969 Jessica Stream Suite 727 | New Dawnton | VA | USA | 76035 | 610-5566 | michael.taylor@prestonwright.com |
| Molly | Perez | Atkinson, Mcfarland and Walters | 48058 Mark Square Apt. 206 | Mullinsshire | NY | USA | 12308 | 364-6225 | molly.perez@atkinsonmcfarla.com |
| Thomas | Mcgee | Ross, Miller and Shaw | 78376 Ann Street | East Charles | WI | USA | 56870 | 591-1665 | thomasm@rossmillerandsh.net |
| James | Cooper | Johnson, Torres and Huerta | 270 James Landing Apt. 110 | New Sara | VI | USA | 38208 | 051-4770 | jamesc@johnsontorresan.net |
| Jason | Medina | Payne LLC | 206 Jonathan Circle Suite 394 | South Dianatown | CA | USA | 51441 | 451-0463 | jason_medina@paynellc.org |
| William | Mckinney | Washington-Harper | 38780 John Pines | Matthewfurt | WA | USA | 21079 | 055-5438 | williamm@washingtonharpe.net |
| Lisa | Garrett | Zamora-Briggs | 432 Prince Shoals | North Jessica | NC | USA | 89367 | 936-3926 | lisag@zamorabriggs.net |
| Renee | Murphy | Anderson, Delgado and Carpenter | 48262 Lonnie Point | East Lonnieberg | VA | USA | 04365 | 566-4742 | r.murphy@andersondelgado.com |
| Daniel | Lopez | Jensen, Obrien and Salazar | 05172 Joseph Landing | Port Paul | NJ | USA | 18525 | 233-0604 | daniel_lopez@jensenobrienand.org |
| Jeffrey | Powers | Todd Inc | 9757 Ronald Trail | New Jillfurt | VA | USA | 41513 | 699-9880 | jeffrey.powers@toddinc.com |
| Shannon | Wilcox | Rich and Sons | 086 James Mill Suite 447 | South Kelly | PW | USA | 07650 | 827-7181 | s.wilcox@richandsons.com |
| Kimberly | Pace | Payne, Long and Morris | 79371 Nguyen Run | Lake Jessica | CO | USA | 15464 | 751-8689 | k.pace@paynelongandmor.com |
| Nicholas | James | Barr PLC | 22064 Cross Mission | Courtneyville | MH | USA | 17746 | 309-4077 | nicholas_james@barrplc.org |
| Amy | Smith | Young-Chapman | 6719 John Plaza Suite 983 | East Eddiestad | AZ | USA | 19555 | 099-4510 | amy.smith@youngchapman.com |
| Robert | Thompson | Mitchell, Guerrero and Graves | 9501 Morris Light | Port Ronaldside | CA | USA | 38883 | 721-4586 | r.thompson@mitchellguerrer.com |
| Heather | Salazar | Duncan Ltd | 9469 Green Ports | Sarashire | NM | USA | 68619 | 772-9343 | heather.salazar@duncanltd.com |
| David | Marshall | Mclaughlin and Sons | 0558 Alex Flats Suite 414 | Williammouth | WI | USA | 01304 | 155-6990 | d.marshall@mclaughlinandso.com |
================================================
FILE: tasks/postgres/standard/chinook/customer_data_migration/meta.json
================================================
{
"task_id": "customer_data_migration",
"task_name": "Customer Data Migration",
"category_id": "chinook",
"category_name": "Chinook",
"description": "Migrate customer data from acquired company MelodyMart into Chinook database using bulk operations and business logic.",
"author": "Lingxiao Du",
"created_at": "2025-08-12",
"difficulty": "L3",
"tags": [
"data migration",
"transactional operations"
],
"mcp": [
"postgres"
],
"meta_data": {
"stateType": "text",
"stateContent": "Table \"Album\" {\n \"AlbumId\" int4 [pk, not null]\n \"Title\" varchar(160) [not null]\n \"ArtistId\" int4 [not null]\n\n Indexes {\n ArtistId [type: btree, name: \"IFK_AlbumArtistId\"]\n }\n}\n\nTable \"Artist\" {\n \"ArtistId\" int4 [pk, not null]\n \"Name\" varchar(120)\n}\n\nTable \"Customer\" {\n \"CustomerId\" int4 [pk, not null]\n \"FirstName\" varchar(40) [not null]\n \"LastName\" varchar(20) [not null]\n \"Company\" varchar(80)\n \"Address\" varchar(70)\n \"City\" varchar(40)\n \"State\" varchar(40)\n \"Country\" varchar(40)\n \"PostalCode\" varchar(10)\n \"Phone\" varchar(24)\n \"Fax\" varchar(24)\n \"Email\" varchar(60) [not null]\n \"SupportRepId\" int4\n\n Indexes {\n SupportRepId [type: btree, name: \"IFK_CustomerSupportRepId\"]\n }\n}\n\nTable \"Employee\" {\n \"EmployeeId\" int4 [pk, not null]\n \"LastName\" varchar(20) [not null]\n \"FirstName\" varchar(20) [not null]\n \"Title\" varchar(30)\n \"ReportsTo\" int4\n \"BirthDate\" timestamp\n \"HireDate\" timestamp\n \"Address\" varchar(70)\n \"City\" varchar(40)\n \"State\" varchar(40)\n \"Country\" varchar(40)\n \"PostalCode\" varchar(10)\n \"Phone\" varchar(24)\n \"Fax\" varchar(24)\n \"Email\" varchar(60)\n\n Indexes {\n ReportsTo [type: btree, name: \"IFK_EmployeeReportsTo\"]\n }\n}\n\nTable \"Genre\" {\n \"GenreId\" int4 [pk, not null]\n \"Name\" varchar(120)\n}\n\nTable \"Invoice\" {\n \"InvoiceId\" int4 [pk, not null]\n \"CustomerId\" int4 [not null]\n \"InvoiceDate\" timestamp [not null]\n \"BillingAddress\" varchar(70)\n \"BillingCity\" varchar(40)\n \"BillingState\" varchar(40)\n \"BillingCountry\" varchar(40)\n \"BillingPostalCode\" varchar(10)\n \"Total\" numeric(10,2) [not null]\n\n Indexes {\n CustomerId [type: btree, name: \"IFK_InvoiceCustomerId\"]\n }\n}\n\nTable \"InvoiceLine\" {\n \"InvoiceLineId\" int4 [pk, not null]\n \"InvoiceId\" int4 [not null]\n \"TrackId\" int4 [not null]\n \"UnitPrice\" numeric(10,2) [not null]\n \"Quantity\" int4 [not null]\n\n Indexes {\n InvoiceId [type: btree, name: \"IFK_InvoiceLineInvoiceId\"]\n TrackId [type: btree, name: \"IFK_InvoiceLineTrackId\"]\n }\n}\n\nTable \"MediaType\" {\n \"MediaTypeId\" int4 [pk, not null]\n \"Name\" varchar(120)\n}\n\nTable \"Playlist\" {\n \"PlaylistId\" int4 [pk, not null]\n \"Name\" varchar(120)\n}\n\nTable \"PlaylistTrack\" {\n \"PlaylistId\" int4 [not null]\n \"TrackId\" int4 [not null]\n\n Indexes {\n (PlaylistId, TrackId) [type: btree, name: \"PK_PlaylistTrack\"]\n TrackId [type: btree, name: \"IFK_PlaylistTrackTrackId\"]\n }\n}\n\nTable \"Track\" {\n \"TrackId\" int4 [pk, not null]\n \"Name\" varchar(200) [not null]\n \"AlbumId\" int4\n \"MediaTypeId\" int4 [not null]\n \"GenreId\" int4\n \"Composer\" varchar(220)\n \"Milliseconds\" int4 [not null]\n \"Bytes\" int4\n \"UnitPrice\" numeric(10,2) [not null]\n\n Indexes {\n AlbumId [type: btree, name: \"IFK_TrackAlbumId\"]\n GenreId [type: btree, name: \"IFK_TrackGenreId\"]\n MediaTypeId [type: btree, name: \"IFK_TrackMediaTypeId\"]\n }\n}\n\nRef \"FK_AlbumArtistId\":\"Artist\".\"ArtistId\" < \"Album\".\"ArtistId\"\n\nRef \"FK_CustomerSupportRepId\":\"Employee\".\"EmployeeId\" < \"Customer\".\"SupportRepId\"\n\nRef \"FK_EmployeeReportsTo\":\"Employee\".\"EmployeeId\" < \"Employee\".\"ReportsTo\"\n\nRef \"FK_InvoiceCustomerId\":\"Customer\".\"CustomerId\" < \"Invoice\".\"CustomerId\"\n\nRef \"FK_InvoiceLineInvoiceId\":\"Invoice\".\"InvoiceId\" < \"InvoiceLine\".\"InvoiceId\"\n\nRef \"FK_InvoiceLineTrackId\":\"Track\".\"TrackId\" < \"InvoiceLine\".\"TrackId\"\n\nRef 
\"FK_PlaylistTrackPlaylistId\":\"Playlist\".\"PlaylistId\" < \"PlaylistTrack\".\"PlaylistId\"\n\nRef \"FK_PlaylistTrackTrackId\":\"Track\".\"TrackId\" < \"PlaylistTrack\".\"TrackId\"\n\nRef \"FK_TrackAlbumId\":\"Album\".\"AlbumId\" < \"Track\".\"AlbumId\"\n\nRef \"FK_TrackGenreId\":\"Genre\".\"GenreId\" < \"Track\".\"GenreId\"\n\nRef \"FK_TrackMediaTypeId\":\"MediaType\".\"MediaTypeId\" < \"Track\".\"MediaTypeId\"\n",
"stateUrl": null,
"stateOriginalUrl": "https://github.com/neondatabase-labs/postgres-sample-dbs/blob/main/chinook.sql"
}
}
================================================
FILE: tasks/postgres/standard/chinook/customer_data_migration/verify.py
================================================
"""
Verification script for PostgreSQL Task 2: Customer Data Migration
"""
import os
import sys
import psycopg2
import pickle
def get_connection_params() -> dict:
"""Get database connection parameters."""
return {
"host": os.getenv("POSTGRES_HOST", "localhost"),
"port": int(os.getenv("POSTGRES_PORT", 5432)),
"database": os.getenv("POSTGRES_DATABASE"),
"user": os.getenv("POSTGRES_USERNAME"),
"password": os.getenv("POSTGRES_PASSWORD")
}
def load_expected_customers():
"""Load the expected customer data from pickle file."""
import os
script_dir = os.path.dirname(os.path.abspath(__file__))
pkl_path = os.path.join(script_dir, 'customer_data.pkl')
try:
with open(pkl_path, 'rb') as f:
return pickle.load(f)
except FileNotFoundError:
print(f"❌ customer_data.pkl not found at {pkl_path}. Please generate customer data first.")
return None
except Exception as e:
print(f"❌ Error loading customer data: {e}")
return None
def verify_migrated_customers(conn, expected_customers) -> bool:
"""Verify migrated customers by comparing with expected data as sets."""
with conn.cursor() as cur:
# Get all customers with ID > 59 (the migrated ones)
cur.execute('''
SELECT "FirstName", "LastName", "Company", "Address", "City",
"State", "Country", "PostalCode", "Phone", "Email",
"SupportRepId", "Fax"
FROM "Customer"
WHERE "CustomerId" > 59
''')
actual_customers = cur.fetchall()
if len(actual_customers) != len(expected_customers):
print(f"❌ Expected {len(expected_customers)} migrated customers, found {len(actual_customers)}")
return False
# Convert expected customers to tuples for set comparison
expected_tuples = set()
for expected in expected_customers:
expected_tuple = (
expected['FirstName'], expected['LastName'], expected['Company'],
expected['Address'], expected['City'], expected['State'],
expected['Country'], expected['PostalCode'], expected['Phone'],
expected['Email'], 3, None # SupportRepId=3, Fax=None
)
expected_tuples.add(expected_tuple)
# Convert actual customers to set with proper type conversion
actual_tuples = set()
for row in actual_customers:
# Convert all fields to strings for consistent comparison
actual_tuple = (
str(row[0]) if row[0] is not None else '', # FirstName
str(row[1]) if row[1] is not None else '', # LastName
str(row[2]) if row[2] is not None else '', # Company
str(row[3]) if row[3] is not None else '', # Address
str(row[4]) if row[4] is not None else '', # City
str(row[5]) if row[5] is not None else '', # State
str(row[6]) if row[6] is not None else '', # Country
str(row[7]) if row[7] is not None else '', # PostalCode
str(row[8]) if row[8] is not None else '', # Phone
str(row[9]) if row[9] is not None else '', # Email
int(row[10]) if row[10] is not None else None, # SupportRepId
row[11] # Fax (should be None)
)
actual_tuples.add(actual_tuple)
# Check if sets are equal
if actual_tuples != expected_tuples:
missing_in_actual = expected_tuples - actual_tuples
extra_in_actual = actual_tuples - expected_tuples
print(f"❌ Customer data sets don't match!")
if missing_in_actual:
print(f" Missing {len(missing_in_actual)} expected customers")
for missing in list(missing_in_actual)[:3]: # Show first 3
print(f" Missing: {missing[0]} {missing[1]} - {missing[2]}")
if len(missing_in_actual) > 3:
print(f" ... and {len(missing_in_actual) - 3} more")
if extra_in_actual:
print(f" Found {len(extra_in_actual)} unexpected customers")
for extra in list(extra_in_actual)[:3]: # Show first 3
print(f" Extra: {extra[0]} {extra[1]} - {extra[2]}")
if len(extra_in_actual) > 3:
print(f" ... and {len(extra_in_actual) - 3} more")
return False
print(f"✅ All {len(expected_customers)} customers migrated correctly")
print(f"✅ All customers assigned to SupportRepId 3")
print(f"✅ All customers have Fax field set to NULL")
print(f"✅ Customer data sets match exactly (order-independent)")
return True
def main():
"""Main verification function."""
print("=" * 60)
print("Verifying Customer Data Migration Task")
print("=" * 60)
# Load expected customer data
expected_customers = load_expected_customers()
if not expected_customers:
sys.exit(1)
print(f"Loaded {len(expected_customers)} expected customer records")
# Get connection parameters
conn_params = get_connection_params()
if not conn_params["database"]:
print("❌ No database specified")
sys.exit(1)
try:
# Connect to database
conn = psycopg2.connect(**conn_params)
# Verify migration
success = verify_migrated_customers(conn, expected_customers)
conn.close()
if success:
print("\n🎉 Task verification: PASS")
sys.exit(0)
else:
print("\n❌ Task verification: FAIL")
sys.exit(1)
except psycopg2.Error as e:
print(f"❌ Database error: {e}")
sys.exit(1)
except Exception as e:
print(f"❌ Verification error: {e}")
sys.exit(1)
if __name__ == "__main__":
main()
================================================
FILE: tasks/postgres/standard/chinook/employee_hierarchy_management/description.md
================================================
Manage employee hierarchy and customer assignments through systematic CRUD operations.
## Your Mission:
Chinook needs to reorganize their employee structure and reassign customer relationships. Complete a series of precise database modifications to update the employee hierarchy.
## Tasks to Complete:
### 1. **INSERT: Add New Employees**
Insert exactly 2 new employees into the Employee table:
- EmployeeId: 9, FirstName: 'Sarah', LastName: 'Johnson', Title: 'Sales Support Agent', ReportsTo: 2, BirthDate: '1985-03-15', HireDate: '2009-01-10', Address: '123 Oak Street', City: 'Calgary', State: 'AB', Country: 'Canada', PostalCode: 'T2P 5G3', Phone: '+1 (403) 555-0123', Fax: '+1 (403) 555-0124', Email: 'sarah.johnson@chinookcorp.com'
- EmployeeId: 10, FirstName: 'Mike', LastName: 'Chen', Title: 'Sales Support Agent', ReportsTo: 2, BirthDate: '1982-08-22', HireDate: '2009-01-10', Address: '456 Pine Ave', City: 'Calgary', State: 'AB', Country: 'Canada', PostalCode: 'T2P 5G4', Phone: '+1 (403) 555-0125', Fax: '+1 (403) 555-0126', Email: 'mike.chen@chinookcorp.com'
### 2. **UPDATE: Modify Existing Employee Information**
- Change Andrew Adams (EmployeeId = 1) title from 'General Manager' to 'CEO'
- Update Nancy Edwards (EmployeeId = 2) phone number to '+1 (403) 555-9999'
- Change all employees with Title = 'IT Staff' to have Title = 'IT Specialist'
### 3. **UPDATE: Reassign Some Customers to New Employees**
- Update customers with CustomerId 1, 2, 3 to have SupportRepId = 9 (Sarah Johnson)
- Update customers with CustomerId 4, 5, 6 to have SupportRepId = 10 (Mike Chen)
### 4. **UPDATE: Reorganize Reporting Structure**
- Change Sarah Johnson (EmployeeId = 9) to report to Andrew Adams (EmployeeId = 1) instead of Nancy Edwards
- Change Mike Chen (EmployeeId = 10) to also report to Andrew Adams (EmployeeId = 1)
### 5. **INSERT: Create Employee Performance Table**
Create a new table called `employee_performance`:
- `employee_id` (integer, foreign key to Employee)
- `customers_assigned` (integer)
- `performance_score` (decimal)
Insert records for employees 9 and 10 by calculating their actual customer assignments:
- Sarah Johnson: calculate actual number of customers assigned to her, performance score 4.5
- Mike Chen: calculate actual number of customers assigned to him, performance score 4.2
### 6. **DELETE: Remove IT Department Employee**
- Delete Robert King (EmployeeId = 7) from the Employee table
- Before deletion, handle all relationships:
- Find who Robert reports to and reassign any employees who report to Robert to report to Robert's manager instead
- Find all customers assigned to Robert as their support rep and reassign them to Robert's manager
### 7. **UPDATE: Promote Remaining IT Staff**
- Promote Laura Callahan (EmployeeId = 8) from 'IT Specialist' to 'Senior IT Specialist'
- Update her salary information by adding a new column `salary` to the Employee table (decimal type)
- Set Laura's salary to 75000.00 and all other employees to 50000.00
### 8. **Final Verification Query**
Execute this exact query to verify all changes:
```sql
SELECT
COUNT(*) as total_employees,
COUNT(CASE WHEN "Title" = 'CEO' THEN 1 END) as ceo_count,
COUNT(CASE WHEN "Title" = 'IT Specialist' THEN 1 END) as it_specialist_count,
COUNT(CASE WHEN "ReportsTo" = 1 THEN 1 END) as reports_to_ceo
FROM "Employee";
```
Expected result: total_employees = 9, ceo_count = 1, it_specialist_count = 0, reports_to_ceo = 4
## Business Rules:
* Use exact EmployeeId values as specified
* Maintain referential integrity between Employee and Customer tables
* All phone numbers must include country code format
* Email addresses must follow the pattern firstname.lastname@chinookcorp.com
## Expected Outcome:
The database should have exactly 9 employees remaining (two added, one removed), with the new hierarchy structure in place and customer assignments updated accordingly.
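For orientation only, here is a minimal SQL sketch of steps 1, 2, and 6, using the quoted Chinook identifiers from the schema; it is illustrative and not a complete or required solution.
```sql
-- Step 1 (first of the two inserts): add Sarah Johnson
INSERT INTO "Employee"
    ("EmployeeId", "LastName", "FirstName", "Title", "ReportsTo", "BirthDate", "HireDate",
     "Address", "City", "State", "Country", "PostalCode", "Phone", "Fax", "Email")
VALUES
    (9, 'Johnson', 'Sarah', 'Sales Support Agent', 2, '1985-03-15', '2009-01-10',
     '123 Oak Street', 'Calgary', 'AB', 'Canada', 'T2P 5G3',
     '+1 (403) 555-0123', '+1 (403) 555-0124', 'sarah.johnson@chinookcorp.com');

-- Step 2: retitle Andrew Adams
UPDATE "Employee" SET "Title" = 'CEO' WHERE "EmployeeId" = 1;

-- Step 6: reassign Robert King's direct reports and customers to his manager,
-- then delete him (order matters because of the foreign keys)
UPDATE "Employee"
SET "ReportsTo" = (SELECT "ReportsTo" FROM "Employee" WHERE "EmployeeId" = 7)
WHERE "ReportsTo" = 7;

UPDATE "Customer"
SET "SupportRepId" = (SELECT "ReportsTo" FROM "Employee" WHERE "EmployeeId" = 7)
WHERE "SupportRepId" = 7;

DELETE FROM "Employee" WHERE "EmployeeId" = 7;
```
Looking up Robert's manager with a subquery keeps the reassignment independent of a hard-coded manager ID.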
================================================
FILE: tasks/postgres/standard/chinook/employee_hierarchy_management/meta.json
================================================
{
"task_id": "employee_hierarchy_management",
"task_name": "Employee Hierarchy Management",
"category_id": "chinook",
"category_name": "Chinook",
"description": "Reorganize employee structure through CRUD operations including inserts, updates, deletes, and customer reassignments.",
"author": "Lingxiao Du",
"created_at": "2025-08-12",
"difficulty": "L3",
"tags": [
"data migration",
"schema design",
"transactional operations"
],
"mcp": [
"postgres"
],
"meta_data": {
"stateType": "text",
"stateContent": "Table \"Album\" {\n \"AlbumId\" int4 [pk, not null]\n \"Title\" varchar(160) [not null]\n \"ArtistId\" int4 [not null]\n\n Indexes {\n ArtistId [type: btree, name: \"IFK_AlbumArtistId\"]\n }\n}\n\nTable \"Artist\" {\n \"ArtistId\" int4 [pk, not null]\n \"Name\" varchar(120)\n}\n\nTable \"Customer\" {\n \"CustomerId\" int4 [pk, not null]\n \"FirstName\" varchar(40) [not null]\n \"LastName\" varchar(20) [not null]\n \"Company\" varchar(80)\n \"Address\" varchar(70)\n \"City\" varchar(40)\n \"State\" varchar(40)\n \"Country\" varchar(40)\n \"PostalCode\" varchar(10)\n \"Phone\" varchar(24)\n \"Fax\" varchar(24)\n \"Email\" varchar(60) [not null]\n \"SupportRepId\" int4\n\n Indexes {\n SupportRepId [type: btree, name: \"IFK_CustomerSupportRepId\"]\n }\n}\n\nTable \"Employee\" {\n \"EmployeeId\" int4 [pk, not null]\n \"LastName\" varchar(20) [not null]\n \"FirstName\" varchar(20) [not null]\n \"Title\" varchar(30)\n \"ReportsTo\" int4\n \"BirthDate\" timestamp\n \"HireDate\" timestamp\n \"Address\" varchar(70)\n \"City\" varchar(40)\n \"State\" varchar(40)\n \"Country\" varchar(40)\n \"PostalCode\" varchar(10)\n \"Phone\" varchar(24)\n \"Fax\" varchar(24)\n \"Email\" varchar(60)\n\n Indexes {\n ReportsTo [type: btree, name: \"IFK_EmployeeReportsTo\"]\n }\n}\n\nTable \"Genre\" {\n \"GenreId\" int4 [pk, not null]\n \"Name\" varchar(120)\n}\n\nTable \"Invoice\" {\n \"InvoiceId\" int4 [pk, not null]\n \"CustomerId\" int4 [not null]\n \"InvoiceDate\" timestamp [not null]\n \"BillingAddress\" varchar(70)\n \"BillingCity\" varchar(40)\n \"BillingState\" varchar(40)\n \"BillingCountry\" varchar(40)\n \"BillingPostalCode\" varchar(10)\n \"Total\" numeric(10,2) [not null]\n\n Indexes {\n CustomerId [type: btree, name: \"IFK_InvoiceCustomerId\"]\n }\n}\n\nTable \"InvoiceLine\" {\n \"InvoiceLineId\" int4 [pk, not null]\n \"InvoiceId\" int4 [not null]\n \"TrackId\" int4 [not null]\n \"UnitPrice\" numeric(10,2) [not null]\n \"Quantity\" int4 [not null]\n\n Indexes {\n InvoiceId [type: btree, name: \"IFK_InvoiceLineInvoiceId\"]\n TrackId [type: btree, name: \"IFK_InvoiceLineTrackId\"]\n }\n}\n\nTable \"MediaType\" {\n \"MediaTypeId\" int4 [pk, not null]\n \"Name\" varchar(120)\n}\n\nTable \"Playlist\" {\n \"PlaylistId\" int4 [pk, not null]\n \"Name\" varchar(120)\n}\n\nTable \"PlaylistTrack\" {\n \"PlaylistId\" int4 [not null]\n \"TrackId\" int4 [not null]\n\n Indexes {\n (PlaylistId, TrackId) [type: btree, name: \"PK_PlaylistTrack\"]\n TrackId [type: btree, name: \"IFK_PlaylistTrackTrackId\"]\n }\n}\n\nTable \"Track\" {\n \"TrackId\" int4 [pk, not null]\n \"Name\" varchar(200) [not null]\n \"AlbumId\" int4\n \"MediaTypeId\" int4 [not null]\n \"GenreId\" int4\n \"Composer\" varchar(220)\n \"Milliseconds\" int4 [not null]\n \"Bytes\" int4\n \"UnitPrice\" numeric(10,2) [not null]\n\n Indexes {\n AlbumId [type: btree, name: \"IFK_TrackAlbumId\"]\n GenreId [type: btree, name: \"IFK_TrackGenreId\"]\n MediaTypeId [type: btree, name: \"IFK_TrackMediaTypeId\"]\n }\n}\n\nRef \"FK_AlbumArtistId\":\"Artist\".\"ArtistId\" < \"Album\".\"ArtistId\"\n\nRef \"FK_CustomerSupportRepId\":\"Employee\".\"EmployeeId\" < \"Customer\".\"SupportRepId\"\n\nRef \"FK_EmployeeReportsTo\":\"Employee\".\"EmployeeId\" < \"Employee\".\"ReportsTo\"\n\nRef \"FK_InvoiceCustomerId\":\"Customer\".\"CustomerId\" < \"Invoice\".\"CustomerId\"\n\nRef \"FK_InvoiceLineInvoiceId\":\"Invoice\".\"InvoiceId\" < \"InvoiceLine\".\"InvoiceId\"\n\nRef \"FK_InvoiceLineTrackId\":\"Track\".\"TrackId\" < \"InvoiceLine\".\"TrackId\"\n\nRef 
\"FK_PlaylistTrackPlaylistId\":\"Playlist\".\"PlaylistId\" < \"PlaylistTrack\".\"PlaylistId\"\n\nRef \"FK_PlaylistTrackTrackId\":\"Track\".\"TrackId\" < \"PlaylistTrack\".\"TrackId\"\n\nRef \"FK_TrackAlbumId\":\"Album\".\"AlbumId\" < \"Track\".\"AlbumId\"\n\nRef \"FK_TrackGenreId\":\"Genre\".\"GenreId\" < \"Track\".\"GenreId\"\n\nRef \"FK_TrackMediaTypeId\":\"MediaType\".\"MediaTypeId\" < \"Track\".\"MediaTypeId\"\n",
"stateUrl": null,
"stateOriginalUrl": "https://github.com/neondatabase-labs/postgres-sample-dbs/blob/main/chinook.sql"
}
}
================================================
FILE: tasks/postgres/standard/chinook/employee_hierarchy_management/verify.py
================================================
"""
Verification script for PostgreSQL Task 3: Employee Hierarchy Management
"""
import os
import sys
import psycopg2
from decimal import Decimal
from datetime import datetime
def rows_match(actual_row, expected_row):
"""
Compare two rows with appropriate tolerance.
For Decimal types: allows 0.01 tolerance
For other types: requires exact match
"""
if len(actual_row) != len(expected_row):
return False
for actual, expected in zip(actual_row, expected_row):
if isinstance(actual, Decimal) and isinstance(expected, Decimal):
if abs(float(actual) - float(expected)) > 0.01:
return False
elif actual != expected:
return False
return True
def get_connection_params() -> dict:
"""Get database connection parameters."""
return {
"host": os.getenv("POSTGRES_HOST", "localhost"),
"port": int(os.getenv("POSTGRES_PORT", 5432)),
"database": os.getenv("POSTGRES_DATABASE"),
"user": os.getenv("POSTGRES_USERNAME"),
"password": os.getenv("POSTGRES_PASSWORD")
}
def verify_employee_count_and_titles(conn) -> bool:
"""Verify the final employee count and title changes."""
with conn.cursor() as cur:
# Check the final verification query results
cur.execute("""
SELECT
COUNT(*) as total_employees,
COUNT(CASE WHEN "Title" = 'CEO' THEN 1 END) as ceo_count,
COUNT(CASE WHEN "Title" = 'IT Specialist' THEN 1 END) as it_specialist_count,
COUNT(CASE WHEN "ReportsTo" = 1 THEN 1 END) as reports_to_ceo
FROM "Employee"
""")
result = cur.fetchone()
total_employees, ceo_count, it_specialist_count, reports_to_ceo = result
        # Expected: total_employees = 9, ceo_count = 1, it_specialist_count = 0, reports_to_ceo = 4
if total_employees != 9:
print(f"❌ Expected 9 total employees, got {total_employees}")
return False
if ceo_count != 1:
print(f"❌ Expected 1 CEO, got {ceo_count}")
return False
if it_specialist_count != 0:
print(f"❌ Expected 0 IT Specialists, got {it_specialist_count}")
return False
if reports_to_ceo != 4:
print(f"❌ Expected 4 employees reporting to CEO, got {reports_to_ceo}")
return False
print("✅ Employee count and title verification passed")
return True
def verify_specific_employees(conn) -> bool:
"""Verify specific employee records and modifications."""
with conn.cursor() as cur:
# Check all employee fields in one query
cur.execute("""
SELECT "EmployeeId", "LastName", "FirstName", "Title", "ReportsTo", "BirthDate",
"HireDate", "Address", "City", "State", "Country", "PostalCode",
"Phone", "Fax", "Email"
FROM "Employee"
WHERE "EmployeeId" IN (1, 2, 9, 10)
ORDER BY "EmployeeId"
""")
employees = cur.fetchall()
expected = [
# Andrew Adams (ID 1) - Title changes to 'CEO', phone stays original, ReportsTo stays None
(1, 'Adams', 'Andrew', 'CEO', None, datetime(1962, 2, 18), datetime(2002, 8, 14),
'11120 Jasper Ave NW', 'Edmonton', 'AB', 'Canada', 'T5K 2N1', '+1 (780) 428-9482', '+1 (780) 428-3457', 'andrew@chinookcorp.com'),
# Nancy Edwards (ID 2) - Phone changes, title stays 'Sales Manager', ReportsTo stays 1
(2, 'Edwards', 'Nancy', 'Sales Manager', 1, datetime(1958, 12, 8), datetime(2002, 5, 1),
'825 8 Ave SW', 'Calgary', 'AB', 'Canada', 'T2P 2T3', '+1 (403) 555-9999', '+1 (403) 262-3322', 'nancy@chinookcorp.com'),
# Sarah Johnson - all new data, final ReportsTo = 1 (changed in step 4)
(9, 'Johnson', 'Sarah', 'Sales Support Agent', 1, datetime(1985, 3, 15), datetime(2009, 1, 10),
'123 Oak Street', 'Calgary', 'AB', 'Canada', 'T2P 5G3', '+1 (403) 555-0123', '+1 (403) 555-0124', 'sarah.johnson@chinookcorp.com'),
# Mike Chen - all new data, final ReportsTo = 1 (changed in step 4)
(10, 'Chen', 'Mike', 'Sales Support Agent', 1, datetime(1982, 8, 22), datetime(2009, 1, 10),
'456 Pine Ave', 'Calgary', 'AB', 'Canada', 'T2P 5G4', '+1 (403) 555-0125', '+1 (403) 555-0126', 'mike.chen@chinookcorp.com')
]
if len(employees) != 4:
print(f"❌ Expected 4 key employees, found {len(employees)}")
return False
# Full field comparison for all employees using rows_match
for actual, expected_emp in zip(employees, expected):
if not rows_match(actual, expected_emp):
print(f"❌ Employee {actual[0]} row mismatch: expected {expected_emp}, got {actual}")
return False
print("✅ Specific employee verification passed - all fields match exactly")
return True
def verify_customer_assignments(conn) -> bool:
"""Verify customer support representative assignments."""
with conn.cursor() as cur:
# Check customers 1, 2, 3 are assigned to Sarah (ID 9)
cur.execute("""
SELECT COUNT(*)
FROM "Customer"
WHERE "CustomerId" IN (1, 2, 3) AND "SupportRepId" = 9
""")
sarah_customers = cur.fetchone()[0]
if sarah_customers != 3:
print(f"❌ Expected 3 customers assigned to Sarah Johnson, got {sarah_customers}")
return False
# Check customers 4, 5, 6 are assigned to Mike (ID 10)
cur.execute("""
SELECT COUNT(*)
FROM "Customer"
WHERE "CustomerId" IN (4, 5, 6) AND "SupportRepId" = 10
""")
mike_customers = cur.fetchone()[0]
if mike_customers != 3:
print(f"❌ Expected 3 customers assigned to Mike Chen, got {mike_customers}")
return False
print("✅ Customer assignment verification passed")
return True
def verify_performance_table(conn) -> bool:
"""Verify the employee_performance table exists and has correct data."""
with conn.cursor() as cur:
try:
# Get all performance records
cur.execute("""
SELECT employee_id, customers_assigned, performance_score
FROM employee_performance
ORDER BY employee_id
""")
actual_results = cur.fetchall()
# Get actual customer counts for verification
cur.execute("""
SELECT "SupportRepId", COUNT(*)
FROM "Customer"
WHERE "SupportRepId" IN (9, 10)
GROUP BY "SupportRepId"
ORDER BY "SupportRepId"
""")
customer_counts = dict(cur.fetchall())
expected = [
(9, customer_counts.get(9, 0), Decimal('4.5')), # Sarah Johnson
(10, customer_counts.get(10, 0), Decimal('4.2')) # Mike Chen
]
if len(actual_results) != 2:
print(f"❌ Expected 2 performance records, got {len(actual_results)}")
return False
for actual, expected_row in zip(actual_results, expected):
if not rows_match(actual, expected_row):
print(f"❌ Performance record mismatch: expected {expected_row}, got {actual}")
return False
print("✅ Employee performance table verification passed")
return True
except psycopg2.Error as e:
print(f"❌ Employee performance table verification failed: {e}")
return False
def verify_employee_deletion_and_promotion(conn) -> bool:
"""Verify Robert King deletion and Laura Callahan promotion."""
with conn.cursor() as cur:
try:
# Verify Robert King (ID 7) is deleted
cur.execute("""
SELECT COUNT(*) FROM "Employee" WHERE "EmployeeId" = 7
""")
if cur.fetchone()[0] != 0:
print("❌ Robert King (EmployeeId = 7) should be deleted")
return False
# Verify Laura Callahan (ID 8) promotion
cur.execute("""
SELECT "Title" FROM "Employee" WHERE "EmployeeId" = 8
""")
laura_title = cur.fetchone()
if not laura_title or laura_title[0] != 'Senior IT Specialist':
print(f"❌ Laura Callahan should have title 'Senior IT Specialist', got: {laura_title[0] if laura_title else None}")
return False
print("✅ Employee deletion and promotion verification passed")
return True
except psycopg2.Error as e:
print(f"❌ Employee deletion/promotion verification failed: {e}")
return False
def verify_salary_column(conn) -> bool:
"""Verify salary column exists and has correct values."""
with conn.cursor() as cur:
try:
# Check if salary column exists and get all salary values
cur.execute("""
SELECT "EmployeeId", salary
FROM "Employee"
ORDER BY "EmployeeId"
""")
salary_data = cur.fetchall()
# Verify Laura (ID 8) has 75000.00, others have 50000.00
for emp_id, salary in salary_data:
expected_salary = Decimal('75000.00') if emp_id == 8 else Decimal('50000.00')
if salary != expected_salary:
print(f"❌ Employee {emp_id} salary should be {expected_salary}, got {salary}")
return False
print("✅ Salary column verification passed")
return True
except psycopg2.Error as e:
print(f"❌ Salary column verification failed: {e}")
return False
def main():
"""Main verification function."""
print("=" * 50)
print("Verifying Task 3: Employee Hierarchy Management")
print("=" * 50)
# Get connection parameters
conn_params = get_connection_params()
if not conn_params["database"]:
print("❌ No database specified")
sys.exit(1)
try:
# Connect to database
conn = psycopg2.connect(**conn_params)
# Run verification checks with short-circuit evaluation
success = (verify_employee_count_and_titles(conn) and
verify_specific_employees(conn) and
verify_customer_assignments(conn) and
verify_performance_table(conn) and
verify_employee_deletion_and_promotion(conn) and
verify_salary_column(conn))
conn.close()
if success:
print("\n🎉 Task verification: PASS")
print("All employee hierarchy management operations completed correctly!")
sys.exit(0)
else:
print("\n❌ Task verification: FAIL")
sys.exit(1)
except psycopg2.Error as e:
print(f"❌ Database error: {e}")
sys.exit(1)
except Exception as e:
print(f"❌ Verification error: {e}")
sys.exit(1)
if __name__ == "__main__":
main()
================================================
FILE: tasks/postgres/standard/chinook/sales_and_music_charts/description.md
================================================
Create a monthly sales dashboard and top music charts system for Chinook's management team to track business performance and identify trending music content.
## Your Tasks:
1. **Build the monthly sales summary table** — create a table called `monthly_sales_summary` in the `public` schema with:
* `year_month` (varchar) — format as 'YYYY-MM' (e.g., '2009-01')
* `total_invoices` (integer) — number of invoices in that month
* `total_revenue` (decimal) — sum of all invoice totals for the month
* `total_tracks_sold` (integer) — total quantity of individual tracks sold
* `average_invoice_value` (decimal) — average invoice amount for the month
* `unique_customers` (integer) — count of distinct customers who made purchases
2. **Create the music charts table** — build a table called `top_music_charts` in the `public` schema with:
* `chart_type` (varchar) — either 'top_tracks', 'top_albums', or 'top_artists'
* `rank_position` (integer) — ranking from 1 to 10
* `item_id` (integer) — ID of the track, album, or artist
* `item_name` (varchar) — name of the track, album, or artist
* `total_revenue` (decimal) — total revenue generated by this item
3. **Populate the monthly sales data**:
* Calculate metrics for each month that has invoice data
* Use invoice date to determine the month
* **Note**: Each invoice can contain multiple invoice lines (tracks)
4. **Generate the top 10 charts**:
* **Top Tracks**: Rank tracks by total quantity sold across all invoices
* **Top Albums**: Rank albums by total revenue generated from their tracks
* **Top Artists**: Rank artists by total revenue from all their tracks across all albums
5. **Business rules to follow**:
* Only include months where at least one invoice exists
* For album rankings, sum revenue from all tracks in each album
* For artist rankings, sum revenue from all tracks across all their albums
* Handle ties by using item name alphabetically as tiebreaker
* Exclude any items with zero sales
This system will provide clear, actionable business intelligence for monthly reporting and music trend analysis.
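For orientation, a minimal sketch of steps 1 and 3 is shown below, assuming the Chinook column names from the schema; it is illustrative rather than prescriptive, and the charts in steps 2 and 4 follow the same pattern.
```sql
-- Step 1: summary table (column types as listed above)
CREATE TABLE public.monthly_sales_summary (
    year_month            varchar,
    total_invoices        integer,
    total_revenue         decimal,
    total_tracks_sold     integer,
    average_invoice_value decimal,
    unique_customers      integer
);

-- Step 3: aggregate invoices per month; tracks sold are summed separately
-- so that joining "InvoiceLine" does not inflate the invoice-level metrics
WITH invoice_metrics AS (
    SELECT DATE_TRUNC('month', i."InvoiceDate") AS ym,
           COUNT(*)                             AS total_invoices,
           SUM(i."Total")                       AS total_revenue,
           AVG(i."Total")                       AS average_invoice_value,
           COUNT(DISTINCT i."CustomerId")       AS unique_customers
    FROM "Invoice" i
    GROUP BY 1
),
track_metrics AS (
    SELECT DATE_TRUNC('month', i."InvoiceDate") AS ym,
           SUM(il."Quantity")                   AS total_tracks_sold
    FROM "Invoice" i
    JOIN "InvoiceLine" il ON il."InvoiceId" = i."InvoiceId"
    GROUP BY 1
)
INSERT INTO public.monthly_sales_summary
    (year_month, total_invoices, total_revenue, total_tracks_sold,
     average_invoice_value, unique_customers)
SELECT TO_CHAR(im.ym, 'YYYY-MM'),
       im.total_invoices,
       im.total_revenue,
       COALESCE(tm.total_tracks_sold, 0),
       im.average_invoice_value,
       im.unique_customers
FROM invoice_metrics im
LEFT JOIN track_metrics tm USING (ym);
```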
================================================
FILE: tasks/postgres/standard/chinook/sales_and_music_charts/meta.json
================================================
{
"task_id": "sales_and_music_charts",
"task_name": "Sales and Music Charts",
"category_id": "chinook",
"category_name": "Chinook",
"description": "Create monthly sales dashboard and top music charts system for tracking business performance and trending content.",
"author": "Lingxiao Du",
"created_at": "2025-08-12",
"difficulty": "L3",
"tags": [
"reporting and analytics",
"statistical aggregation",
"schema design"
],
"mcp": [
"postgres"
],
"meta_data": {
"stateType": "text",
"stateContent": "Table \"Album\" {\n \"AlbumId\" int4 [pk, not null]\n \"Title\" varchar(160) [not null]\n \"ArtistId\" int4 [not null]\n\n Indexes {\n ArtistId [type: btree, name: \"IFK_AlbumArtistId\"]\n }\n}\n\nTable \"Artist\" {\n \"ArtistId\" int4 [pk, not null]\n \"Name\" varchar(120)\n}\n\nTable \"Customer\" {\n \"CustomerId\" int4 [pk, not null]\n \"FirstName\" varchar(40) [not null]\n \"LastName\" varchar(20) [not null]\n \"Company\" varchar(80)\n \"Address\" varchar(70)\n \"City\" varchar(40)\n \"State\" varchar(40)\n \"Country\" varchar(40)\n \"PostalCode\" varchar(10)\n \"Phone\" varchar(24)\n \"Fax\" varchar(24)\n \"Email\" varchar(60) [not null]\n \"SupportRepId\" int4\n\n Indexes {\n SupportRepId [type: btree, name: \"IFK_CustomerSupportRepId\"]\n }\n}\n\nTable \"Employee\" {\n \"EmployeeId\" int4 [pk, not null]\n \"LastName\" varchar(20) [not null]\n \"FirstName\" varchar(20) [not null]\n \"Title\" varchar(30)\n \"ReportsTo\" int4\n \"BirthDate\" timestamp\n \"HireDate\" timestamp\n \"Address\" varchar(70)\n \"City\" varchar(40)\n \"State\" varchar(40)\n \"Country\" varchar(40)\n \"PostalCode\" varchar(10)\n \"Phone\" varchar(24)\n \"Fax\" varchar(24)\n \"Email\" varchar(60)\n\n Indexes {\n ReportsTo [type: btree, name: \"IFK_EmployeeReportsTo\"]\n }\n}\n\nTable \"Genre\" {\n \"GenreId\" int4 [pk, not null]\n \"Name\" varchar(120)\n}\n\nTable \"Invoice\" {\n \"InvoiceId\" int4 [pk, not null]\n \"CustomerId\" int4 [not null]\n \"InvoiceDate\" timestamp [not null]\n \"BillingAddress\" varchar(70)\n \"BillingCity\" varchar(40)\n \"BillingState\" varchar(40)\n \"BillingCountry\" varchar(40)\n \"BillingPostalCode\" varchar(10)\n \"Total\" numeric(10,2) [not null]\n\n Indexes {\n CustomerId [type: btree, name: \"IFK_InvoiceCustomerId\"]\n }\n}\n\nTable \"InvoiceLine\" {\n \"InvoiceLineId\" int4 [pk, not null]\n \"InvoiceId\" int4 [not null]\n \"TrackId\" int4 [not null]\n \"UnitPrice\" numeric(10,2) [not null]\n \"Quantity\" int4 [not null]\n\n Indexes {\n InvoiceId [type: btree, name: \"IFK_InvoiceLineInvoiceId\"]\n TrackId [type: btree, name: \"IFK_InvoiceLineTrackId\"]\n }\n}\n\nTable \"MediaType\" {\n \"MediaTypeId\" int4 [pk, not null]\n \"Name\" varchar(120)\n}\n\nTable \"Playlist\" {\n \"PlaylistId\" int4 [pk, not null]\n \"Name\" varchar(120)\n}\n\nTable \"PlaylistTrack\" {\n \"PlaylistId\" int4 [not null]\n \"TrackId\" int4 [not null]\n\n Indexes {\n (PlaylistId, TrackId) [type: btree, name: \"PK_PlaylistTrack\"]\n TrackId [type: btree, name: \"IFK_PlaylistTrackTrackId\"]\n }\n}\n\nTable \"Track\" {\n \"TrackId\" int4 [pk, not null]\n \"Name\" varchar(200) [not null]\n \"AlbumId\" int4\n \"MediaTypeId\" int4 [not null]\n \"GenreId\" int4\n \"Composer\" varchar(220)\n \"Milliseconds\" int4 [not null]\n \"Bytes\" int4\n \"UnitPrice\" numeric(10,2) [not null]\n\n Indexes {\n AlbumId [type: btree, name: \"IFK_TrackAlbumId\"]\n GenreId [type: btree, name: \"IFK_TrackGenreId\"]\n MediaTypeId [type: btree, name: \"IFK_TrackMediaTypeId\"]\n }\n}\n\nRef \"FK_AlbumArtistId\":\"Artist\".\"ArtistId\" < \"Album\".\"ArtistId\"\n\nRef \"FK_CustomerSupportRepId\":\"Employee\".\"EmployeeId\" < \"Customer\".\"SupportRepId\"\n\nRef \"FK_EmployeeReportsTo\":\"Employee\".\"EmployeeId\" < \"Employee\".\"ReportsTo\"\n\nRef \"FK_InvoiceCustomerId\":\"Customer\".\"CustomerId\" < \"Invoice\".\"CustomerId\"\n\nRef \"FK_InvoiceLineInvoiceId\":\"Invoice\".\"InvoiceId\" < \"InvoiceLine\".\"InvoiceId\"\n\nRef \"FK_InvoiceLineTrackId\":\"Track\".\"TrackId\" < \"InvoiceLine\".\"TrackId\"\n\nRef 
\"FK_PlaylistTrackPlaylistId\":\"Playlist\".\"PlaylistId\" < \"PlaylistTrack\".\"PlaylistId\"\n\nRef \"FK_PlaylistTrackTrackId\":\"Track\".\"TrackId\" < \"PlaylistTrack\".\"TrackId\"\n\nRef \"FK_TrackAlbumId\":\"Album\".\"AlbumId\" < \"Track\".\"AlbumId\"\n\nRef \"FK_TrackGenreId\":\"Genre\".\"GenreId\" < \"Track\".\"GenreId\"\n\nRef \"FK_TrackMediaTypeId\":\"MediaType\".\"MediaTypeId\" < \"Track\".\"MediaTypeId\"\n",
"stateUrl": null,
"stateOriginalUrl": "https://github.com/neondatabase-labs/postgres-sample-dbs/blob/main/chinook.sql"
}
}
================================================
FILE: tasks/postgres/standard/chinook/sales_and_music_charts/verify.py
================================================
"""
Verification script for PostgreSQL Task 1: Monthly Sales Dashboard and Music Charts
"""
import os
import sys
import psycopg2
from decimal import Decimal
def rows_match(actual_row, expected_row):
"""
Compare two rows with appropriate tolerance.
For Decimal types: allows 0.01 tolerance
For other types: requires exact match
"""
if len(actual_row) != len(expected_row):
return False
for actual, expected in zip(actual_row, expected_row):
if isinstance(actual, Decimal) and isinstance(expected, Decimal):
if abs(float(actual) - float(expected)) > 0.01:
return False
elif actual != expected:
return False
return True
def get_connection_params() -> dict:
"""Get database connection parameters."""
return {
"host": os.getenv("POSTGRES_HOST", "localhost"),
"port": int(os.getenv("POSTGRES_PORT", 5432)),
"database": os.getenv("POSTGRES_DATABASE"),
"user": os.getenv("POSTGRES_USERNAME"),
"password": os.getenv("POSTGRES_PASSWORD")
}
def verify_monthly_sales_results(conn) -> bool:
"""Verify the monthly sales summary results."""
with conn.cursor() as cur:
# Get actual results from the created table
cur.execute("""
SELECT year_month, total_invoices, total_revenue,
total_tracks_sold, average_invoice_value, unique_customers
FROM monthly_sales_summary
ORDER BY year_month
""")
actual_results = cur.fetchall()
# Execute ground truth query
cur.execute("""
WITH invoice_metrics AS (
SELECT
DATE_TRUNC('month', i."InvoiceDate") AS ym,
COUNT(*)::INT AS total_invoices,
SUM(i."Total")::DECIMAL AS total_revenue,
AVG(i."Total")::DECIMAL AS average_invoice_value,
COUNT(DISTINCT i."CustomerId")::INT AS unique_customers
FROM "Invoice" i
GROUP BY 1
),
track_metrics AS (
SELECT
DATE_TRUNC('month', i."InvoiceDate") AS ym,
SUM(il."Quantity")::INT AS total_tracks_sold
FROM "Invoice" i
JOIN "InvoiceLine" il ON il."InvoiceId" = i."InvoiceId"
WHERE il."Quantity" > 0
GROUP BY 1
)
SELECT
TO_CHAR(im.ym, 'YYYY-MM') AS year_month,
im.total_invoices,
im.total_revenue,
COALESCE(tm.total_tracks_sold, 0) AS total_tracks_sold,
im.average_invoice_value,
im.unique_customers
FROM invoice_metrics im
LEFT JOIN track_metrics tm USING (ym)
ORDER BY year_month;
""")
expected_results = cur.fetchall()
if len(actual_results) != len(expected_results):
print(f"❌ Expected {len(expected_results)} monthly sales records, got {len(actual_results)}")
return False
mismatches = 0
for i, (actual, expected) in enumerate(zip(actual_results, expected_results)):
if not rows_match(actual, expected):
if mismatches < 5: # Only show first 5 mismatches
print(f"❌ Monthly sales row {i+1} mismatch: expected {expected}, got {actual}")
mismatches += 1
if mismatches > 0:
print(f"❌ Total monthly sales mismatches: {mismatches}")
return False
print(f"✅ Monthly sales results are correct ({len(actual_results)} records)")
return True
def verify_music_charts_results(conn) -> bool:
"""Verify the music charts results."""
with conn.cursor() as cur:
# Get actual results from the created table
cur.execute("""
SELECT chart_type, rank_position, item_id, item_name, total_revenue
FROM top_music_charts
ORDER BY chart_type, rank_position
""")
actual_results = cur.fetchall()
# Execute ground truth queries for each chart type
cur.execute("""
WITH track_stats AS (
SELECT
'top_tracks'::varchar AS chart_type,
t."TrackId" AS item_id,
t."Name" AS item_name,
SUM(il."UnitPrice" * il."Quantity")::DECIMAL AS total_revenue,
SUM(il."Quantity")::INT AS total_quantity
FROM "Track" t
JOIN "InvoiceLine" il ON il."TrackId" = t."TrackId"
GROUP BY t."TrackId", t."Name"
HAVING SUM(il."Quantity") > 0
),
track_ranked AS (
SELECT
chart_type, item_id, item_name, total_revenue,
ROW_NUMBER() OVER (ORDER BY total_quantity DESC, item_name, item_id) AS rank_position
FROM track_stats
),
album_rev AS (
SELECT
'top_albums'::varchar AS chart_type,
a."AlbumId" AS item_id,
a."Title" AS item_name,
SUM(il."UnitPrice" * il."Quantity")::DECIMAL AS total_revenue
FROM "Album" a
JOIN "Track" t ON t."AlbumId" = a."AlbumId"
JOIN "InvoiceLine" il ON il."TrackId" = t."TrackId"
GROUP BY a."AlbumId", a."Title"
HAVING SUM(il."UnitPrice" * il."Quantity") > 0
),
album_ranked AS (
SELECT
chart_type, item_id, item_name, total_revenue,
ROW_NUMBER() OVER (ORDER BY total_revenue DESC, item_name, item_id) AS rank_position
FROM album_rev
),
artist_rev AS (
SELECT
'top_artists'::varchar AS chart_type,
ar."ArtistId" AS item_id,
ar."Name" AS item_name,
SUM(il."UnitPrice" * il."Quantity")::DECIMAL AS total_revenue
FROM "Artist" ar
JOIN "Album" a ON a."ArtistId" = ar."ArtistId"
JOIN "Track" t ON t."AlbumId" = a."AlbumId"
JOIN "InvoiceLine" il ON il."TrackId" = t."TrackId"
GROUP BY ar."ArtistId", ar."Name"
HAVING SUM(il."UnitPrice" * il."Quantity") > 0
),
artist_ranked AS (
SELECT
chart_type, item_id, item_name, total_revenue,
ROW_NUMBER() OVER (ORDER BY total_revenue DESC, item_name, item_id) AS rank_position
FROM artist_rev
)
SELECT chart_type, rank_position, item_id, item_name, total_revenue
FROM (
SELECT * FROM track_ranked WHERE rank_position <= 10
UNION ALL
SELECT * FROM album_ranked WHERE rank_position <= 10
UNION ALL
SELECT * FROM artist_ranked WHERE rank_position <= 10
) x
ORDER BY chart_type, rank_position;
""")
expected_results = cur.fetchall()
if len(actual_results) != len(expected_results):
print(f"❌ Expected {len(expected_results)} music chart records, got {len(actual_results)}")
return False
mismatches = 0
for i, (actual, expected) in enumerate(zip(actual_results, expected_results)):
if not rows_match(actual, expected):
if mismatches < 5: # Only show first 5 mismatches
print(f"❌ Music chart row {i+1} mismatch: expected {expected}, got {actual}")
mismatches += 1
if mismatches > 0:
print(f"❌ Total music chart mismatches: {mismatches}")
return False
print(f"✅ Music chart results are correct ({len(actual_results)} records)")
return True
def main():
"""Main verification function."""
print("=" * 50)
# Get connection parameters
conn_params = get_connection_params()
if not conn_params["database"]:
print("❌ No database specified")
sys.exit(1)
try:
# Connect to database
conn = psycopg2.connect(**conn_params)
# Verify results
success = verify_monthly_sales_results(conn) and verify_music_charts_results(conn)
conn.close()
if success:
print("\n🎉 Task verification: PASS")
sys.exit(0)
else:
print("\n❌ Task verification: FAIL")
sys.exit(1)
except psycopg2.Error as e:
print(f"❌ Database error: {e}")
sys.exit(1)
except Exception as e:
print(f"❌ Verification error: {e}")
sys.exit(1)
if __name__ == "__main__":
main()
================================================
FILE: tasks/postgres/standard/dvdrental/customer_analysis_fix/description.md
================================================
Fix the customer analysis query that is producing incorrect results.
## Background
The data analytics team attempted to create a customer behavior analysis query to identify active customers and analyze their spending patterns and preferences. The requirements are:
- Only count rentals that have associated payment records (paid rentals)
- Only include customers with at least 15 paid rentals
- Only include customers with valid email addresses
However, the query is producing incorrect results: it over-counts rentals and calculates the wrong spending amounts.
Your task is to fix this query to produce accurate results.
## The Problematic Query
Here's the buggy query that needs to be fixed:
```sql
WITH customer_basic_stats AS (
SELECT
c.customer_id,
c.first_name || ' ' || c.last_name as customer_name,
ci.city as customer_city,
co.country as customer_country,
COUNT(r.rental_id) as total_rentals,
COUNT(DISTINCT i.film_id) as unique_films,
SUM(p.amount) as total_spent,
AVG(EXTRACT(days FROM (r.return_date - r.rental_date))) as avg_rental_duration
FROM customer c
JOIN address a ON c.address_id = a.address_id
JOIN city ci ON a.city_id = ci.city_id
JOIN country co ON ci.country_id = co.country_id
JOIN rental r ON c.customer_id = r.customer_id
JOIN inventory i ON r.inventory_id = i.inventory_id
JOIN payment p ON r.rental_id = p.rental_id
WHERE c.email IS NOT NULL
GROUP BY c.customer_id, c.first_name, c.last_name, ci.city, co.country
HAVING COUNT(r.rental_id) >= 15
),
customer_categories AS (
SELECT
c.customer_id,
cat.name as category_name,
COUNT(*) as category_count,
ROW_NUMBER() OVER (PARTITION BY c.customer_id ORDER BY COUNT(*) DESC, cat.name ASC) as rn
FROM customer c
JOIN rental r ON c.customer_id = r.customer_id
JOIN inventory i ON r.inventory_id = i.inventory_id
JOIN film f ON i.film_id = f.film_id
JOIN film_category fc ON f.film_id = fc.film_id
JOIN category cat ON fc.category_id = cat.category_id
JOIN payment p ON r.rental_id = p.rental_id
WHERE c.email IS NOT NULL
GROUP BY c.customer_id, cat.name
),
customer_actors AS (
SELECT
c.customer_id,
a.first_name || ' ' || a.last_name as actor_name,
COUNT(*) as actor_count,
ROW_NUMBER() OVER (PARTITION BY c.customer_id ORDER BY COUNT(*) DESC, (a.first_name || ' ' || a.last_name) ASC) as rn
FROM customer c
JOIN rental r ON c.customer_id = r.customer_id
JOIN inventory i ON r.inventory_id = i.inventory_id
JOIN film f ON i.film_id = f.film_id
JOIN film_actor fa ON f.film_id = fa.film_id
JOIN actor a ON fa.actor_id = a.actor_id
JOIN payment p ON r.rental_id = p.rental_id
WHERE c.email IS NOT NULL
GROUP BY c.customer_id, a.first_name, a.last_name
),
regional_popular_films AS (
SELECT
co.country,
f.title,
COUNT(*) as rental_count,
ROW_NUMBER() OVER (PARTITION BY co.country ORDER BY COUNT(*) DESC, f.title ASC) as rn
FROM rental r
JOIN inventory i ON r.inventory_id = i.inventory_id
JOIN film f ON i.film_id = f.film_id
JOIN customer c ON r.customer_id = c.customer_id
JOIN address a ON c.address_id = a.address_id
JOIN city ci ON a.city_id = ci.city_id
JOIN country co ON ci.country_id = co.country_id
JOIN payment p ON r.rental_id = p.rental_id
WHERE c.email IS NOT NULL
GROUP BY co.country, f.title
)
SELECT
cbs.customer_id,
cbs.customer_name,
cbs.customer_city,
cbs.customer_country,
cbs.total_rentals,
cbs.unique_films,
cbs.total_spent,
cc.category_name as favorite_category,
ca.actor_name as favorite_actor,
cbs.avg_rental_duration,
CASE
WHEN cbs.total_spent >= 150 THEN 'Premium'
WHEN cbs.total_spent >= 75 THEN 'Standard'
ELSE 'Basic'
END as customer_tier,
rpf.title as most_popular_film_in_region,
rpf.rental_count as regional_film_rental_count
FROM customer_basic_stats cbs
LEFT JOIN customer_categories cc ON cbs.customer_id = cc.customer_id AND cc.rn = 1
LEFT JOIN customer_actors ca ON cbs.customer_id = ca.customer_id AND ca.rn = 1
LEFT JOIN regional_popular_films rpf ON cbs.customer_country = rpf.country AND rpf.rn = 1
ORDER BY cbs.total_spent DESC, cbs.total_rentals DESC, cbs.customer_name ASC;
```
## Known Issues
When comparing the problematic query results with the expected correct values, the following discrepancies are observed:
1. **Rental count discrepancies**: Many customers show higher `total_rentals` counts than expected
2. **Spending amount errors**: The `total_spent` values don't match the correct calculations
3. **Incorrect favorite categories and actors**: Many customers show wrong favorite categories and actors compared to the expected results
4. **Time calculation inconsistencies**: The `avg_rental_duration` values differ significantly from the correct calculations
- Example: Customer ID 1 shows 3.90 days instead of the expected 4.27 days
- Example: Customer ID 2 shows 5.23 days instead of the expected 5.69 days
## Your Task
Debug and fix the query to produce accurate results. Then create a table with your corrected results.
1. **Fix the query** to ensure:
- Accurate customer spending and rental counts
- Correct favorite categories and actors
- Proper regional popular films
2. **Create a table** called `customer_analysis_fixed` in the `public` schema with your corrected query results. The table should have the same columns as the original query output.
**Important**: The business logic and output columns should remain the same - only fix the data accuracy issues.
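One quick diagnostic (a hedged sketch; customer 1 is simply the example cited above) is to compare a raw join count with a de-duplicated count for a single customer:
```sql
-- Illustrative: a rental with several payment rows is counted once per payment
-- when joined directly, which inflates every downstream aggregate
SELECT
    COUNT(r.rental_id)          AS joined_rows,
    COUNT(DISTINCT r.rental_id) AS paid_rentals
FROM rental r
JOIN payment p ON p.rental_id = r.rental_id
WHERE r.customer_id = 1;
```
If the two numbers differ, the direct join is fanning rentals out across their payment rows, which is consistent with the discrepancies listed in the Known Issues section.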
================================================
FILE: tasks/postgres/standard/dvdrental/customer_analysis_fix/meta.json
================================================
{
"task_id": "customer_analysis_fix",
"task_name": "Customer Analysis Fix",
"category_id": "dvdrental",
"category_name": "DVD Rental",
"description": "Debug and fix customer behavior analysis query producing incorrect rental counts and spending calculations.",
"author": "Lingxiao Du",
"created_at": "2025-08-20",
"difficulty": "L3",
"tags": [
"performance optimization",
"data integrity enforcement"
],
"mcp": [
"postgres"
],
"meta_data": {
"stateType": "text",
"stateContent": "Enum \"mpaa_rating\" {\n \"G\"\n \"PG\"\n \"PG-13\"\n \"R\"\n \"NC-17\"\n}\n\nTable \"customer\" {\n \"customer_id\" int4 [pk, not null, increment]\n \"store_id\" int2 [not null]\n \"first_name\" varchar(45) [not null]\n \"last_name\" varchar(45) [not null]\n \"email\" varchar(50)\n \"address_id\" int2 [not null]\n \"activebool\" bool [not null, default: true]\n \"create_date\" date [not null, default: `('now'::text)::date`]\n \"last_update\" timestamp [default: `now()`]\n \"active\" int4\n\n Indexes {\n address_id [type: btree, name: \"idx_fk_address_id\"]\n store_id [type: btree, name: \"idx_fk_store_id\"]\n last_name [type: btree, name: \"idx_last_name\"]\n }\n}\n\nTable \"actor\" {\n \"actor_id\" int4 [pk, not null, increment]\n \"first_name\" varchar(45) [not null]\n \"last_name\" varchar(45) [not null]\n \"last_update\" timestamp [not null, default: `now()`]\n\n Indexes {\n last_name [type: btree, name: \"idx_actor_last_name\"]\n }\n}\n\nTable \"category\" {\n \"category_id\" int4 [pk, not null, increment]\n \"name\" varchar(25) [not null]\n \"last_update\" timestamp [not null, default: `now()`]\n}\n\nTable \"film\" {\n \"film_id\" int4 [pk, not null, increment]\n \"title\" varchar(255) [not null]\n \"description\" text\n \"release_year\" int4\n \"language_id\" int2 [not null]\n \"rental_duration\" int2 [not null, default: 3]\n \"rental_rate\" numeric(4,2) [not null, default: 4.99]\n \"length\" int2\n \"replacement_cost\" numeric(5,2) [not null, default: 19.99]\n \"rating\" mpaa_rating [default: 'G']\n \"last_update\" timestamp [not null, default: `now()`]\n \"special_features\" \"text[]\"\n \"fulltext\" tsvector [not null]\n\n Indexes {\n fulltext [type: gist, name: \"film_fulltext_idx\"]\n language_id [type: btree, name: \"idx_fk_language_id\"]\n title [type: btree, name: \"idx_title\"]\n }\n}\n\nTable \"film_actor\" {\n \"actor_id\" int2 [not null]\n \"film_id\" int2 [not null]\n \"last_update\" timestamp [not null, default: `now()`]\n\n Indexes {\n (actor_id, film_id) [type: btree, name: \"film_actor_pkey\"]\n film_id [type: btree, name: \"idx_fk_film_id\"]\n }\n}\n\nTable \"film_category\" {\n \"film_id\" int2 [not null]\n \"category_id\" int2 [not null]\n \"last_update\" timestamp [not null, default: `now()`]\n\n Indexes {\n (film_id, category_id) [type: btree, name: \"film_category_pkey\"]\n }\n}\n\nTable \"address\" {\n \"address_id\" int4 [pk, not null, increment]\n \"address\" varchar(50) [not null]\n \"address2\" varchar(50)\n \"district\" varchar(20) [not null]\n \"city_id\" int2 [not null]\n \"postal_code\" varchar(10)\n \"phone\" varchar(20) [not null]\n \"last_update\" timestamp [not null, default: `now()`]\n\n Indexes {\n city_id [type: btree, name: \"idx_fk_city_id\"]\n }\n}\n\nTable \"city\" {\n \"city_id\" int4 [pk, not null, increment]\n \"city\" varchar(50) [not null]\n \"country_id\" int2 [not null]\n \"last_update\" timestamp [not null, default: `now()`]\n\n Indexes {\n country_id [type: btree, name: \"idx_fk_country_id\"]\n }\n}\n\nTable \"country\" {\n \"country_id\" int4 [pk, not null, increment]\n \"country\" varchar(50) [not null]\n \"last_update\" timestamp [not null, default: `now()`]\n}\n\nTable \"inventory\" {\n \"inventory_id\" int4 [pk, not null, increment]\n \"film_id\" int2 [not null]\n \"store_id\" int2 [not null]\n \"last_update\" timestamp [not null, default: `now()`]\n\n Indexes {\n (store_id, film_id) [type: btree, name: \"idx_store_id_film_id\"]\n }\n}\n\nTable \"language\" {\n \"language_id\" int4 [pk, not null, increment]\n 
\"name\" bpchar(20) [not null]\n \"last_update\" timestamp [not null, default: `now()`]\n}\n\nTable \"payment\" {\n \"payment_id\" int4 [pk, not null, increment]\n \"customer_id\" int2 [not null]\n \"staff_id\" int2 [not null]\n \"rental_id\" int4 [not null]\n \"amount\" numeric(5,2) [not null]\n \"payment_date\" timestamp [not null]\n\n Indexes {\n rental_id [type: btree, name: \"idx_fk_rental_id\"]\n staff_id [type: btree, name: \"idx_fk_staff_id\"]\n }\n}\n\nTable \"rental\" {\n \"rental_id\" int4 [pk, not null, increment]\n \"rental_date\" timestamp [not null]\n \"inventory_id\" int4 [not null]\n \"customer_id\" int2 [not null]\n \"return_date\" timestamp\n \"staff_id\" int2 [not null]\n \"last_update\" timestamp [not null, default: `now()`]\n\n Indexes {\n (rental_date, inventory_id, customer_id) [type: btree, name: \"idx_unq_rental_rental_date_inventory_id_customer_id\"]\n inventory_id [type: btree, name: \"idx_fk_inventory_id\"]\n }\n}\n\nTable \"staff\" {\n \"staff_id\" int4 [pk, not null, increment]\n \"first_name\" varchar(45) [not null]\n \"last_name\" varchar(45) [not null]\n \"address_id\" int2 [not null]\n \"email\" varchar(50)\n \"store_id\" int2 [not null]\n \"active\" bool [not null, default: true]\n \"username\" varchar(16) [not null]\n \"password\" varchar(40)\n \"last_update\" timestamp [not null, default: `now()`]\n \"picture\" bytea\n}\n\nTable \"store\" {\n \"store_id\" int4 [pk, not null, increment]\n \"manager_staff_id\" int2 [unique, not null]\n \"address_id\" int2 [not null]\n \"last_update\" timestamp [not null, default: `now()`]\n}\n\nRef \"fk_address_city\":\"city\".\"city_id\" < \"address\".\"city_id\"\n\nRef \"fk_city\":\"country\".\"country_id\" < \"city\".\"country_id\"\n\nRef \"customer_address_id_fkey\":\"address\".\"address_id\" < \"customer\".\"address_id\" [update: cascade, delete: restrict]\n\nRef \"film_language_id_fkey\":\"language\".\"language_id\" < \"film\".\"language_id\" [update: cascade, delete: restrict]\n\nRef \"film_actor_actor_id_fkey\":\"actor\".\"actor_id\" < \"film_actor\".\"actor_id\" [update: cascade, delete: restrict]\n\nRef \"film_actor_film_id_fkey\":\"film\".\"film_id\" < \"film_actor\".\"film_id\" [update: cascade, delete: restrict]\n\nRef \"film_category_category_id_fkey\":\"category\".\"category_id\" < \"film_category\".\"category_id\" [update: cascade, delete: restrict]\n\nRef \"film_category_film_id_fkey\":\"film\".\"film_id\" < \"film_category\".\"film_id\" [update: cascade, delete: restrict]\n\nRef \"inventory_film_id_fkey\":\"film\".\"film_id\" < \"inventory\".\"film_id\" [update: cascade, delete: restrict]\n\nRef \"payment_customer_id_fkey\":\"customer\".\"customer_id\" < \"payment\".\"customer_id\" [update: cascade, delete: restrict]\n\nRef \"payment_rental_id_fkey\":\"rental\".\"rental_id\" < \"payment\".\"rental_id\" [update: cascade, delete: set null]\n\nRef \"payment_staff_id_fkey\":\"staff\".\"staff_id\" < \"payment\".\"staff_id\" [update: cascade, delete: restrict]\n\nRef \"rental_customer_id_fkey\":\"customer\".\"customer_id\" < \"rental\".\"customer_id\" [update: cascade, delete: restrict]\n\nRef \"rental_inventory_id_fkey\":\"inventory\".\"inventory_id\" < \"rental\".\"inventory_id\" [update: cascade, delete: restrict]\n\nRef \"rental_staff_id_key\":\"staff\".\"staff_id\" < \"rental\".\"staff_id\"\n\nRef \"staff_address_id_fkey\":\"address\".\"address_id\" < \"staff\".\"address_id\" [update: cascade, delete: restrict]\n\nRef \"store_address_id_fkey\":\"address\".\"address_id\" < \"store\".\"address_id\" 
[update: cascade, delete: restrict]\n\nRef \"store_manager_staff_id_fkey\":\"staff\".\"staff_id\" < \"store\".\"manager_staff_id\" [update: cascade, delete: restrict]\n",
"stateUrl": null,
"stateOriginalUrl": "https://github.com/gordonkwokkwok/DVD-Rental-PostgreSQL-Project"
}
}
================================================
FILE: tasks/postgres/standard/dvdrental/customer_analysis_fix/verify.py
================================================
"""
Verification script for PostgreSQL Task 3: Fix Customer Analysis Query
"""
import os
import sys
import psycopg2
from decimal import Decimal
def get_connection_params() -> dict:
"""Get database connection parameters."""
return {
"host": os.getenv("POSTGRES_HOST", "localhost"),
"port": int(os.getenv("POSTGRES_PORT", 5432)),
"database": os.getenv("POSTGRES_DATABASE"),
"user": os.getenv("POSTGRES_USERNAME"),
"password": os.getenv("POSTGRES_PASSWORD")
}
def rows_match(actual_row, expected_row):
"""Compare two rows with appropriate tolerance for decimals and floats."""
if len(actual_row) != len(expected_row):
return False
for actual, expected in zip(actual_row, expected_row):
if isinstance(actual, (Decimal, float)) and isinstance(expected, (Decimal, float)):
# Use higher tolerance for floating point comparisons
if abs(float(actual) - float(expected)) > 0.1:
return False
elif actual != expected:
return False
return True
def verify_customer_analysis_fixed_table(conn) -> bool:
"""Verify the customer_analysis_fixed table results."""
with conn.cursor() as cur:
# Get actual results from the created table
cur.execute("""
SELECT customer_id, customer_name, customer_city, customer_country,
total_rentals, unique_films, total_spent, favorite_category,
favorite_actor, avg_rental_duration, customer_tier,
most_popular_film_in_region, regional_film_rental_count
FROM customer_analysis_fixed
ORDER BY total_spent DESC, total_rentals DESC, customer_name ASC
""")
actual_results = cur.fetchall()
# Execute ground truth query (the corrected version)
cur.execute("""
WITH paid_rentals AS (
SELECT DISTINCT
r.rental_id,
r.customer_id,
r.inventory_id,
r.rental_date,
r.return_date
FROM rental r
JOIN payment p ON p.rental_id = r.rental_id
),
payments_by_customer AS (
SELECT pr.customer_id, SUM(p.amount) AS total_spent
FROM paid_rentals pr
JOIN payment p ON p.rental_id = pr.rental_id
GROUP BY pr.customer_id
),
customer_basic_stats AS (
SELECT
c.customer_id,
c.first_name || ' ' || c.last_name AS customer_name,
ci.city AS customer_city,
co.country AS customer_country,
COUNT(DISTINCT pr.rental_id) AS total_rentals,
COUNT(DISTINCT i.film_id) AS unique_films,
pbc.total_spent,
AVG(EXTRACT(EPOCH FROM (pr.return_date - pr.rental_date)) / 86400.0) AS avg_rental_duration
FROM customer c
JOIN address a ON c.address_id = a.address_id
JOIN city ci ON a.city_id = ci.city_id
JOIN country co ON ci.country_id = co.country_id
JOIN paid_rentals pr ON pr.customer_id = c.customer_id
JOIN inventory i ON pr.inventory_id = i.inventory_id
JOIN payments_by_customer pbc ON pbc.customer_id = c.customer_id
WHERE c.email IS NOT NULL
GROUP BY c.customer_id, c.first_name, c.last_name, ci.city, co.country, pbc.total_spent
HAVING COUNT(DISTINCT pr.rental_id) >= 15
),
customer_categories AS (
SELECT
pr.customer_id,
cat.name AS category_name,
COUNT(*) AS category_count,
ROW_NUMBER() OVER (
PARTITION BY pr.customer_id
ORDER BY COUNT(*) DESC, cat.name ASC
) AS rn
FROM paid_rentals pr
JOIN inventory i ON pr.inventory_id = i.inventory_id
JOIN film f ON i.film_id = f.film_id
JOIN film_category fc ON f.film_id = fc.film_id
JOIN category cat ON fc.category_id = cat.category_id
JOIN customer c ON pr.customer_id = c.customer_id
WHERE c.email IS NOT NULL
GROUP BY pr.customer_id, cat.name
),
customer_actors AS (
SELECT
pr.customer_id,
(a.first_name || ' ' || a.last_name) AS actor_name,
COUNT(*) AS actor_count,
ROW_NUMBER() OVER (
PARTITION BY pr.customer_id
ORDER BY COUNT(*) DESC, (a.first_name || ' ' || a.last_name) ASC
) AS rn
FROM paid_rentals pr
JOIN inventory i ON pr.inventory_id = i.inventory_id
JOIN film f ON i.film_id = f.film_id
JOIN film_actor fa ON f.film_id = fa.film_id
JOIN actor a ON fa.actor_id = a.actor_id
JOIN customer c ON pr.customer_id = c.customer_id
WHERE c.email IS NOT NULL
GROUP BY pr.customer_id, a.first_name, a.last_name
),
regional_popular_films AS (
SELECT
co.country,
f.title,
COUNT(DISTINCT pr.rental_id) AS rental_count,
ROW_NUMBER() OVER (
PARTITION BY co.country
ORDER BY COUNT(DISTINCT pr.rental_id) DESC, f.title ASC
) AS rn
FROM paid_rentals pr
JOIN customer c ON pr.customer_id = c.customer_id
JOIN address a ON c.address_id = a.address_id
JOIN city ci ON a.city_id = ci.city_id
JOIN country co ON ci.country_id = co.country_id
JOIN inventory i ON pr.inventory_id = i.inventory_id
JOIN film f ON i.film_id = f.film_id
WHERE c.email IS NOT NULL
GROUP BY co.country, f.title
)
SELECT
cbs.customer_id,
cbs.customer_name,
cbs.customer_city,
cbs.customer_country,
cbs.total_rentals,
cbs.unique_films,
cbs.total_spent,
cc.category_name AS favorite_category,
ca.actor_name AS favorite_actor,
cbs.avg_rental_duration,
CASE
WHEN cbs.total_spent >= 150 THEN 'Premium'
WHEN cbs.total_spent >= 75 THEN 'Standard'
ELSE 'Basic'
END AS customer_tier,
rpf.title AS most_popular_film_in_region,
rpf.rental_count AS regional_film_rental_count
FROM customer_basic_stats cbs
LEFT JOIN customer_categories cc
ON cbs.customer_id = cc.customer_id AND cc.rn = 1
LEFT JOIN customer_actors ca
ON cbs.customer_id = ca.customer_id AND ca.rn = 1
LEFT JOIN regional_popular_films rpf
ON cbs.customer_country = rpf.country AND rpf.rn = 1
ORDER BY cbs.total_spent DESC, cbs.total_rentals DESC, cbs.customer_name ASC;
""")
expected_results = cur.fetchall()
if len(actual_results) != len(expected_results):
print(f"❌ Expected {len(expected_results)} rows, got {len(actual_results)}")
return False
mismatches = 0
for i, (actual, expected) in enumerate(zip(actual_results, expected_results)):
if not rows_match(actual, expected):
if mismatches < 5: # Only show first 5 mismatches
print(f"❌ Row {i+1} mismatch:")
print(f" Expected: {expected}")
print(f" Actual: {actual}")
mismatches += 1
if mismatches > 0:
print(f"❌ Total mismatches: {mismatches}")
return False
print(f"✅ Query results are correct ({len(actual_results)} rows)")
return True
def main():
"""Main verification function."""
print("=" * 70)
print("PostgreSQL Task 3 Verification: Fix Customer Analysis Query")
print("=" * 70)
# Get connection parameters
conn_params = get_connection_params()
if not conn_params["database"]:
print("❌ No database specified")
sys.exit(1)
try:
# Connect to database
conn = psycopg2.connect(**conn_params)
# Verify results
success = verify_customer_analysis_fixed_table(conn)
conn.close()
if success:
print("\n🎉 Task verification: PASS")
print(" - Query was successfully debugged and fixed")
print(" - All 587 rows match the expected results")
sys.exit(0)
else:
print("\n❌ Task verification: FAIL")
print(" - The query still has issues")
print(" - Please review the duplicate counting problem")
sys.exit(1)
except psycopg2.Error as e:
print(f"❌ Database error: {e}")
sys.exit(1)
except Exception as e:
print(f"❌ Verification error: {e}")
sys.exit(1)
if __name__ == "__main__":
main()
================================================
FILE: tasks/postgres/standard/dvdrental/customer_analytics_optimization/description.md
================================================
Optimize a slow customer analytics query in the DVD rental database.
## Background
The business intelligence team is running customer analytics reports, but one of their critical queries has become extremely slow. The query that used to run in milliseconds is now taking over a second to complete, causing timeout issues in their reporting dashboard.
## Your Task
Analyze and optimize the performance of this customer analytics query:
```sql
SELECT
c.customer_id,
c.first_name,
c.last_name,
c.email,
COUNT(DISTINCT p.payment_id) as total_payments,
SUM(p.amount) as total_spent,
AVG(p.amount) as avg_payment,
COUNT(DISTINCT EXTRACT(month FROM p.payment_date)) as active_months,
MAX(p.payment_date) as last_payment,
MIN(p.payment_date) as first_payment,
(SELECT COUNT(*) FROM payment p2 WHERE p2.customer_id = c.customer_id AND p2.amount > 5.0) as high_value_payments,
(SELECT SUM(amount) FROM payment p3 WHERE p3.customer_id = c.customer_id AND p3.payment_date >= '2007-03-01') as recent_spending
FROM customer c
JOIN payment p ON c.customer_id = p.customer_id
WHERE c.active = 1
GROUP BY c.customer_id, c.first_name, c.last_name, c.email
HAVING COUNT(p.payment_id) >= 10
ORDER BY total_spent DESC, total_payments DESC;
```
The query is currently taking over 1000ms to execute and has a very high cost in the execution plan. The team needs this optimized urgently as it's blocking their daily reporting processes.
## Requirements
- Use `EXPLAIN ANALYZE` to identify performance bottlenecks
- Implement appropriate database optimizations
- Ensure queries return accurate results after optimization
- Document your optimization approach and performance improvements
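Two starting points are sketched below; the index name is hypothetical and neither change is prescribed as the solution, so validate both with `EXPLAIN ANALYZE` against the original query.
```sql
-- Illustrative only. The payment table has no index on customer_id in the schema
-- above, so each correlated subquery re-scans it per customer; an index is one
-- candidate fix (idx_payment_customer_id is a hypothetical name).
CREATE INDEX IF NOT EXISTS idx_payment_customer_id ON payment (customer_id);

-- The two correlated subqueries can also be folded into the main GROUP BY as
-- conditional aggregates over the already-joined payment rows, for example:
--   COUNT(*)      FILTER (WHERE p.amount > 5.0)                  AS high_value_payments,
--   SUM(p.amount) FILTER (WHERE p.payment_date >= '2007-03-01')  AS recent_spending
```
Comparing the `EXPLAIN ANALYZE` output before and after each change makes the effect on the plan and runtime directly visible.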
================================================
FILE: tasks/postgres/standard/dvdrental/customer_analytics_optimization/meta.json
================================================
{
"task_id": "customer_analytics_optimization",
"task_name": "Customer Analytics Optimization",
"category_id": "dvdrental",
"category_name": "DVD Rental",
"description": "Optimize slow customer analytics query with correlated subqueries causing timeout issues in reporting dashboard.",
"author": "Lingxiao Du",
"created_at": "2025-08-20",
"difficulty": "L3",
"tags": [
"performance optimization"
],
"mcp": [
"postgres"
],
"meta_data": {
"stateType": "text",
"stateContent": "Enum \"mpaa_rating\" {\n \"G\"\n \"PG\"\n \"PG-13\"\n \"R\"\n \"NC-17\"\n}\n\nTable \"customer\" {\n \"customer_id\" int4 [pk, not null, increment]\n \"store_id\" int2 [not null]\n \"first_name\" varchar(45) [not null]\n \"last_name\" varchar(45) [not null]\n \"email\" varchar(50)\n \"address_id\" int2 [not null]\n \"activebool\" bool [not null, default: true]\n \"create_date\" date [not null, default: `('now'::text)::date`]\n \"last_update\" timestamp [default: `now()`]\n \"active\" int4\n\n Indexes {\n address_id [type: btree, name: \"idx_fk_address_id\"]\n store_id [type: btree, name: \"idx_fk_store_id\"]\n last_name [type: btree, name: \"idx_last_name\"]\n }\n}\n\nTable \"actor\" {\n \"actor_id\" int4 [pk, not null, increment]\n \"first_name\" varchar(45) [not null]\n \"last_name\" varchar(45) [not null]\n \"last_update\" timestamp [not null, default: `now()`]\n\n Indexes {\n last_name [type: btree, name: \"idx_actor_last_name\"]\n }\n}\n\nTable \"category\" {\n \"category_id\" int4 [pk, not null, increment]\n \"name\" varchar(25) [not null]\n \"last_update\" timestamp [not null, default: `now()`]\n}\n\nTable \"film\" {\n \"film_id\" int4 [pk, not null, increment]\n \"title\" varchar(255) [not null]\n \"description\" text\n \"release_year\" int4\n \"language_id\" int2 [not null]\n \"rental_duration\" int2 [not null, default: 3]\n \"rental_rate\" numeric(4,2) [not null, default: 4.99]\n \"length\" int2\n \"replacement_cost\" numeric(5,2) [not null, default: 19.99]\n \"rating\" mpaa_rating [default: 'G']\n \"last_update\" timestamp [not null, default: `now()`]\n \"special_features\" \"text[]\"\n \"fulltext\" tsvector [not null]\n\n Indexes {\n fulltext [type: gist, name: \"film_fulltext_idx\"]\n language_id [type: btree, name: \"idx_fk_language_id\"]\n title [type: btree, name: \"idx_title\"]\n }\n}\n\nTable \"film_actor\" {\n \"actor_id\" int2 [not null]\n \"film_id\" int2 [not null]\n \"last_update\" timestamp [not null, default: `now()`]\n\n Indexes {\n (actor_id, film_id) [type: btree, name: \"film_actor_pkey\"]\n film_id [type: btree, name: \"idx_fk_film_id\"]\n }\n}\n\nTable \"film_category\" {\n \"film_id\" int2 [not null]\n \"category_id\" int2 [not null]\n \"last_update\" timestamp [not null, default: `now()`]\n\n Indexes {\n (film_id, category_id) [type: btree, name: \"film_category_pkey\"]\n }\n}\n\nTable \"address\" {\n \"address_id\" int4 [pk, not null, increment]\n \"address\" varchar(50) [not null]\n \"address2\" varchar(50)\n \"district\" varchar(20) [not null]\n \"city_id\" int2 [not null]\n \"postal_code\" varchar(10)\n \"phone\" varchar(20) [not null]\n \"last_update\" timestamp [not null, default: `now()`]\n\n Indexes {\n city_id [type: btree, name: \"idx_fk_city_id\"]\n }\n}\n\nTable \"city\" {\n \"city_id\" int4 [pk, not null, increment]\n \"city\" varchar(50) [not null]\n \"country_id\" int2 [not null]\n \"last_update\" timestamp [not null, default: `now()`]\n\n Indexes {\n country_id [type: btree, name: \"idx_fk_country_id\"]\n }\n}\n\nTable \"country\" {\n \"country_id\" int4 [pk, not null, increment]\n \"country\" varchar(50) [not null]\n \"last_update\" timestamp [not null, default: `now()`]\n}\n\nTable \"inventory\" {\n \"inventory_id\" int4 [pk, not null, increment]\n \"film_id\" int2 [not null]\n \"store_id\" int2 [not null]\n \"last_update\" timestamp [not null, default: `now()`]\n\n Indexes {\n (store_id, film_id) [type: btree, name: \"idx_store_id_film_id\"]\n }\n}\n\nTable \"language\" {\n \"language_id\" int4 [pk, not null, increment]\n 
\"name\" bpchar(20) [not null]\n \"last_update\" timestamp [not null, default: `now()`]\n}\n\nTable \"payment\" {\n \"payment_id\" int4 [pk, not null, increment]\n \"customer_id\" int2 [not null]\n \"staff_id\" int2 [not null]\n \"rental_id\" int4 [not null]\n \"amount\" numeric(5,2) [not null]\n \"payment_date\" timestamp [not null]\n\n Indexes {\n rental_id [type: btree, name: \"idx_fk_rental_id\"]\n staff_id [type: btree, name: \"idx_fk_staff_id\"]\n }\n}\n\nTable \"rental\" {\n \"rental_id\" int4 [pk, not null, increment]\n \"rental_date\" timestamp [not null]\n \"inventory_id\" int4 [not null]\n \"customer_id\" int2 [not null]\n \"return_date\" timestamp\n \"staff_id\" int2 [not null]\n \"last_update\" timestamp [not null, default: `now()`]\n\n Indexes {\n (rental_date, inventory_id, customer_id) [type: btree, name: \"idx_unq_rental_rental_date_inventory_id_customer_id\"]\n inventory_id [type: btree, name: \"idx_fk_inventory_id\"]\n }\n}\n\nTable \"staff\" {\n \"staff_id\" int4 [pk, not null, increment]\n \"first_name\" varchar(45) [not null]\n \"last_name\" varchar(45) [not null]\n \"address_id\" int2 [not null]\n \"email\" varchar(50)\n \"store_id\" int2 [not null]\n \"active\" bool [not null, default: true]\n \"username\" varchar(16) [not null]\n \"password\" varchar(40)\n \"last_update\" timestamp [not null, default: `now()`]\n \"picture\" bytea\n}\n\nTable \"store\" {\n \"store_id\" int4 [pk, not null, increment]\n \"manager_staff_id\" int2 [unique, not null]\n \"address_id\" int2 [not null]\n \"last_update\" timestamp [not null, default: `now()`]\n}\n\nRef \"fk_address_city\":\"city\".\"city_id\" < \"address\".\"city_id\"\n\nRef \"fk_city\":\"country\".\"country_id\" < \"city\".\"country_id\"\n\nRef \"customer_address_id_fkey\":\"address\".\"address_id\" < \"customer\".\"address_id\" [update: cascade, delete: restrict]\n\nRef \"film_language_id_fkey\":\"language\".\"language_id\" < \"film\".\"language_id\" [update: cascade, delete: restrict]\n\nRef \"film_actor_actor_id_fkey\":\"actor\".\"actor_id\" < \"film_actor\".\"actor_id\" [update: cascade, delete: restrict]\n\nRef \"film_actor_film_id_fkey\":\"film\".\"film_id\" < \"film_actor\".\"film_id\" [update: cascade, delete: restrict]\n\nRef \"film_category_category_id_fkey\":\"category\".\"category_id\" < \"film_category\".\"category_id\" [update: cascade, delete: restrict]\n\nRef \"film_category_film_id_fkey\":\"film\".\"film_id\" < \"film_category\".\"film_id\" [update: cascade, delete: restrict]\n\nRef \"inventory_film_id_fkey\":\"film\".\"film_id\" < \"inventory\".\"film_id\" [update: cascade, delete: restrict]\n\nRef \"payment_customer_id_fkey\":\"customer\".\"customer_id\" < \"payment\".\"customer_id\" [update: cascade, delete: restrict]\n\nRef \"payment_rental_id_fkey\":\"rental\".\"rental_id\" < \"payment\".\"rental_id\" [update: cascade, delete: set null]\n\nRef \"payment_staff_id_fkey\":\"staff\".\"staff_id\" < \"payment\".\"staff_id\" [update: cascade, delete: restrict]\n\nRef \"rental_customer_id_fkey\":\"customer\".\"customer_id\" < \"rental\".\"customer_id\" [update: cascade, delete: restrict]\n\nRef \"rental_inventory_id_fkey\":\"inventory\".\"inventory_id\" < \"rental\".\"inventory_id\" [update: cascade, delete: restrict]\n\nRef \"rental_staff_id_key\":\"staff\".\"staff_id\" < \"rental\".\"staff_id\"\n\nRef \"staff_address_id_fkey\":\"address\".\"address_id\" < \"staff\".\"address_id\" [update: cascade, delete: restrict]\n\nRef \"store_address_id_fkey\":\"address\".\"address_id\" < \"store\".\"address_id\" 
[update: cascade, delete: restrict]\n\nRef \"store_manager_staff_id_fkey\":\"staff\".\"staff_id\" < \"store\".\"manager_staff_id\" [update: cascade, delete: restrict]\n",
"stateUrl": null,
"stateOriginalUrl": "https://github.com/gordonkwokkwok/DVD-Rental-PostgreSQL-Project"
}
}
================================================
FILE: tasks/postgres/standard/dvdrental/customer_analytics_optimization/verify.py
================================================
"""
Verification script for PostgreSQL Task 1: Customer Payment Query Optimization
"""
import os
import sys
import psycopg2
def get_connection_params() -> dict:
"""Get database connection parameters."""
return {
"host": os.getenv("POSTGRES_HOST", "localhost"),
"port": int(os.getenv("POSTGRES_PORT", 5432)),
"database": os.getenv("POSTGRES_DATABASE"),
"user": os.getenv("POSTGRES_USERNAME"),
"password": os.getenv("POSTGRES_PASSWORD")
}
def check_payment_customer_id_index(conn) -> tuple:
    """Check for any index on payment.customer_id; returns (has_index, indexes)."""
with conn.cursor() as cur:
cur.execute("""
SELECT indexname, indexdef
FROM pg_indexes
WHERE schemaname = 'public'
AND tablename = 'payment'
AND indexdef LIKE '%customer_id%'
""")
indexes = cur.fetchall()
        return len(indexes) > 0, indexes
def main():
"""Main verification function."""
print("=" * 60)
print("PostgreSQL Task 1 Verification: Customer Payment Query Optimization")
print("=" * 60)
# Get connection parameters
conn_params = get_connection_params()
if not conn_params["database"]:
print("❌ No database specified")
sys.exit(1)
try:
# Connect to database
conn = psycopg2.connect(**conn_params)
print("\n🔍 Checking for customer_id index on payment table...")
# Check if any index exists on payment.customer_id
has_index, indexes = check_payment_customer_id_index(conn)
if has_index:
print("✅ Found index(es) on payment.customer_id:")
for index_name, index_def in indexes:
print(f" - {index_name}: {index_def}")
else:
print("❌ No index found on payment.customer_id column")
conn.close()
if has_index:
print(f"\n🎉 Task verification: PASS")
print(f" - Index on payment.customer_id exists")
sys.exit(0)
else:
print(f"\n❌ Task verification: FAIL")
print(f" - No index found on payment.customer_id")
print(f" - Create an index on payment(customer_id) to optimize the queries")
sys.exit(1)
except psycopg2.Error as e:
print(f"❌ Database error: {e}")
sys.exit(1)
except Exception as e:
print(f"❌ Verification error: {e}")
sys.exit(1)
if __name__ == "__main__":
main()
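# A minimal sketch of an index that would satisfy this check, assuming the dvdrental
# tables live in the default 'public' schema; the index name itself is arbitrary,
# since the check only matches on the index definition:
#
#   CREATE INDEX idx_payment_customer_id ON payment (customer_id);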
================================================
FILE: tasks/postgres/standard/dvdrental/film_inventory_management/description.md
================================================
Manage film inventory operations in the DVD rental database.
## Background
You are the database administrator for the DVD rental store. The store manager has requested several database operations to manage the film inventory. You need to perform multiple operations including adding new films, updating inventory, querying available films, and cleaning up old records.
## Your Task
Complete the following database operations in sequence:
### 1. Add New Films
Add these two new films to the database:
- **Film 1**: Title "Data Science Adventures", Description "A thrilling journey through machine learning algorithms", Release Year 2024, Language ID 1, Rental Duration 5 days, Rental Rate $3.99, Length 120 minutes, Replacement Cost $15.99, Rating 'PG-13'
- **Film 2**: Title "Cloud Computing Chronicles", Description "Exploring the world of distributed systems", Release Year 2024, Language ID 1, Rental Duration 7 days, Rental Rate $4.99, Length 135 minutes, Replacement Cost $18.99, Rating 'PG'
### 2. Add Inventory Records
For each new film, add 3 inventory records for store_id = 1 and 2 inventory records for store_id = 2.
### 3. Update Film Information
Update the rental_rate of all films with rating 'PG-13' to increase by 10% (multiply by 1.1).
### 4. Create Available Films Table
Create a table called `available_films` with the following structure:
- `film_id` (INTEGER, PRIMARY KEY)
- `title` (VARCHAR(255), NOT NULL)
- `rental_rate` (NUMERIC(4,2), NOT NULL)
- `length` (SMALLINT)
Populate this table with films that meet these criteria:
- Have rental_rate between $3.00 and $5.00
- Have length greater than 100 minutes
- Are available in store_id = 1 (have at least 1 inventory record)
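As an illustrative sketch only (one possible shape, not a required solution), the table and its population could look like:
```sql
CREATE TABLE available_films (
    film_id     INTEGER PRIMARY KEY,
    title       VARCHAR(255) NOT NULL,
    rental_rate NUMERIC(4,2) NOT NULL,
    length      SMALLINT
);

INSERT INTO available_films (film_id, title, rental_rate, length)
SELECT DISTINCT f.film_id, f.title, f.rental_rate, f.length
FROM film f
JOIN inventory i ON i.film_id = f.film_id
WHERE f.rental_rate BETWEEN 3.00 AND 5.00
  AND f.length > 100
  AND i.store_id = 1;
```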
### 5. Clean Up Inventory
Delete inventory records for films that meet ALL of the following criteria:
- Have a replacement_cost greater than $25.00
- AND have rental_rate less than $1.00
- AND have no rental history (no records in the rental table)
### 6. Create Summary Report Table
Create a table called `film_inventory_summary` with the following structure:
- `title` (VARCHAR(255), NOT NULL)
- `rental_rate` (NUMERIC(4,2), NOT NULL)
- `total_inventory` (INTEGER, NOT NULL)
- `store1_count` (INTEGER, NOT NULL)
- `store2_count` (INTEGER, NOT NULL)
Populate this table with a summary query that shows:
- Film title
- Current rental rate (after any updates from step 3)
- Total count of inventory records across all stores
- Count of inventory records in store_id = 1
- Count of inventory records in store_id = 2
Requirements for the summary report:
- Include only films that currently have at least one inventory record
- Insert the results sorted by inventory count from highest to lowest, and then alphabetically by film title
- Ensure all counts reflect the state after completing the previous operations
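One possible populating query, shown only as a sketch (any equivalent query that inserts the same rows in the required order is acceptable):
```sql
INSERT INTO film_inventory_summary (title, rental_rate, total_inventory, store1_count, store2_count)
SELECT f.title,
       f.rental_rate,
       COUNT(i.inventory_id)                  AS total_inventory,
       COUNT(*) FILTER (WHERE i.store_id = 1) AS store1_count,
       COUNT(*) FILTER (WHERE i.store_id = 2) AS store2_count
FROM film f
JOIN inventory i ON i.film_id = f.film_id
GROUP BY f.film_id, f.title, f.rental_rate
ORDER BY total_inventory DESC, f.title ASC;
```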
## Requirements
- Complete all operations in the specified sequence
- Ensure data integrity throughout all operations
- Verify that your operations affect the expected number of records
- Handle any constraint violations appropriately
================================================
FILE: tasks/postgres/standard/dvdrental/film_inventory_management/meta.json
================================================
{
"task_id": "film_inventory_management",
"task_name": "Film Inventory Management",
"category_id": "dvdrental",
"category_name": "DVD Rental",
"description": "Manage film inventory through multiple operations including adding films, updating records, and cleaning old data.",
"author": "Lingxiao Du",
"created_at": "2025-08-20",
"difficulty": "L3",
"tags": [
"data migration",
"transactional operations",
"schema design"
],
"mcp": [
"postgres"
],
"meta_data": {
"stateType": "text",
"stateContent": "Enum \"mpaa_rating\" {\n \"G\"\n \"PG\"\n \"PG-13\"\n \"R\"\n \"NC-17\"\n}\n\nTable \"customer\" {\n \"customer_id\" int4 [pk, not null, increment]\n \"store_id\" int2 [not null]\n \"first_name\" varchar(45) [not null]\n \"last_name\" varchar(45) [not null]\n \"email\" varchar(50)\n \"address_id\" int2 [not null]\n \"activebool\" bool [not null, default: true]\n \"create_date\" date [not null, default: `('now'::text)::date`]\n \"last_update\" timestamp [default: `now()`]\n \"active\" int4\n\n Indexes {\n address_id [type: btree, name: \"idx_fk_address_id\"]\n store_id [type: btree, name: \"idx_fk_store_id\"]\n last_name [type: btree, name: \"idx_last_name\"]\n }\n}\n\nTable \"actor\" {\n \"actor_id\" int4 [pk, not null, increment]\n \"first_name\" varchar(45) [not null]\n \"last_name\" varchar(45) [not null]\n \"last_update\" timestamp [not null, default: `now()`]\n\n Indexes {\n last_name [type: btree, name: \"idx_actor_last_name\"]\n }\n}\n\nTable \"category\" {\n \"category_id\" int4 [pk, not null, increment]\n \"name\" varchar(25) [not null]\n \"last_update\" timestamp [not null, default: `now()`]\n}\n\nTable \"film\" {\n \"film_id\" int4 [pk, not null, increment]\n \"title\" varchar(255) [not null]\n \"description\" text\n \"release_year\" int4\n \"language_id\" int2 [not null]\n \"rental_duration\" int2 [not null, default: 3]\n \"rental_rate\" numeric(4,2) [not null, default: 4.99]\n \"length\" int2\n \"replacement_cost\" numeric(5,2) [not null, default: 19.99]\n \"rating\" mpaa_rating [default: 'G']\n \"last_update\" timestamp [not null, default: `now()`]\n \"special_features\" \"text[]\"\n \"fulltext\" tsvector [not null]\n\n Indexes {\n fulltext [type: gist, name: \"film_fulltext_idx\"]\n language_id [type: btree, name: \"idx_fk_language_id\"]\n title [type: btree, name: \"idx_title\"]\n }\n}\n\nTable \"film_actor\" {\n \"actor_id\" int2 [not null]\n \"film_id\" int2 [not null]\n \"last_update\" timestamp [not null, default: `now()`]\n\n Indexes {\n (actor_id, film_id) [type: btree, name: \"film_actor_pkey\"]\n film_id [type: btree, name: \"idx_fk_film_id\"]\n }\n}\n\nTable \"film_category\" {\n \"film_id\" int2 [not null]\n \"category_id\" int2 [not null]\n \"last_update\" timestamp [not null, default: `now()`]\n\n Indexes {\n (film_id, category_id) [type: btree, name: \"film_category_pkey\"]\n }\n}\n\nTable \"address\" {\n \"address_id\" int4 [pk, not null, increment]\n \"address\" varchar(50) [not null]\n \"address2\" varchar(50)\n \"district\" varchar(20) [not null]\n \"city_id\" int2 [not null]\n \"postal_code\" varchar(10)\n \"phone\" varchar(20) [not null]\n \"last_update\" timestamp [not null, default: `now()`]\n\n Indexes {\n city_id [type: btree, name: \"idx_fk_city_id\"]\n }\n}\n\nTable \"city\" {\n \"city_id\" int4 [pk, not null, increment]\n \"city\" varchar(50) [not null]\n \"country_id\" int2 [not null]\n \"last_update\" timestamp [not null, default: `now()`]\n\n Indexes {\n country_id [type: btree, name: \"idx_fk_country_id\"]\n }\n}\n\nTable \"country\" {\n \"country_id\" int4 [pk, not null, increment]\n \"country\" varchar(50) [not null]\n \"last_update\" timestamp [not null, default: `now()`]\n}\n\nTable \"inventory\" {\n \"inventory_id\" int4 [pk, not null, increment]\n \"film_id\" int2 [not null]\n \"store_id\" int2 [not null]\n \"last_update\" timestamp [not null, default: `now()`]\n\n Indexes {\n (store_id, film_id) [type: btree, name: \"idx_store_id_film_id\"]\n }\n}\n\nTable \"language\" {\n \"language_id\" int4 [pk, not null, increment]\n 
\"name\" bpchar(20) [not null]\n \"last_update\" timestamp [not null, default: `now()`]\n}\n\nTable \"payment\" {\n \"payment_id\" int4 [pk, not null, increment]\n \"customer_id\" int2 [not null]\n \"staff_id\" int2 [not null]\n \"rental_id\" int4 [not null]\n \"amount\" numeric(5,2) [not null]\n \"payment_date\" timestamp [not null]\n\n Indexes {\n rental_id [type: btree, name: \"idx_fk_rental_id\"]\n staff_id [type: btree, name: \"idx_fk_staff_id\"]\n }\n}\n\nTable \"rental\" {\n \"rental_id\" int4 [pk, not null, increment]\n \"rental_date\" timestamp [not null]\n \"inventory_id\" int4 [not null]\n \"customer_id\" int2 [not null]\n \"return_date\" timestamp\n \"staff_id\" int2 [not null]\n \"last_update\" timestamp [not null, default: `now()`]\n\n Indexes {\n (rental_date, inventory_id, customer_id) [type: btree, name: \"idx_unq_rental_rental_date_inventory_id_customer_id\"]\n inventory_id [type: btree, name: \"idx_fk_inventory_id\"]\n }\n}\n\nTable \"staff\" {\n \"staff_id\" int4 [pk, not null, increment]\n \"first_name\" varchar(45) [not null]\n \"last_name\" varchar(45) [not null]\n \"address_id\" int2 [not null]\n \"email\" varchar(50)\n \"store_id\" int2 [not null]\n \"active\" bool [not null, default: true]\n \"username\" varchar(16) [not null]\n \"password\" varchar(40)\n \"last_update\" timestamp [not null, default: `now()`]\n \"picture\" bytea\n}\n\nTable \"store\" {\n \"store_id\" int4 [pk, not null, increment]\n \"manager_staff_id\" int2 [unique, not null]\n \"address_id\" int2 [not null]\n \"last_update\" timestamp [not null, default: `now()`]\n}\n\nRef \"fk_address_city\":\"city\".\"city_id\" < \"address\".\"city_id\"\n\nRef \"fk_city\":\"country\".\"country_id\" < \"city\".\"country_id\"\n\nRef \"customer_address_id_fkey\":\"address\".\"address_id\" < \"customer\".\"address_id\" [update: cascade, delete: restrict]\n\nRef \"film_language_id_fkey\":\"language\".\"language_id\" < \"film\".\"language_id\" [update: cascade, delete: restrict]\n\nRef \"film_actor_actor_id_fkey\":\"actor\".\"actor_id\" < \"film_actor\".\"actor_id\" [update: cascade, delete: restrict]\n\nRef \"film_actor_film_id_fkey\":\"film\".\"film_id\" < \"film_actor\".\"film_id\" [update: cascade, delete: restrict]\n\nRef \"film_category_category_id_fkey\":\"category\".\"category_id\" < \"film_category\".\"category_id\" [update: cascade, delete: restrict]\n\nRef \"film_category_film_id_fkey\":\"film\".\"film_id\" < \"film_category\".\"film_id\" [update: cascade, delete: restrict]\n\nRef \"inventory_film_id_fkey\":\"film\".\"film_id\" < \"inventory\".\"film_id\" [update: cascade, delete: restrict]\n\nRef \"payment_customer_id_fkey\":\"customer\".\"customer_id\" < \"payment\".\"customer_id\" [update: cascade, delete: restrict]\n\nRef \"payment_rental_id_fkey\":\"rental\".\"rental_id\" < \"payment\".\"rental_id\" [update: cascade, delete: set null]\n\nRef \"payment_staff_id_fkey\":\"staff\".\"staff_id\" < \"payment\".\"staff_id\" [update: cascade, delete: restrict]\n\nRef \"rental_customer_id_fkey\":\"customer\".\"customer_id\" < \"rental\".\"customer_id\" [update: cascade, delete: restrict]\n\nRef \"rental_inventory_id_fkey\":\"inventory\".\"inventory_id\" < \"rental\".\"inventory_id\" [update: cascade, delete: restrict]\n\nRef \"rental_staff_id_key\":\"staff\".\"staff_id\" < \"rental\".\"staff_id\"\n\nRef \"staff_address_id_fkey\":\"address\".\"address_id\" < \"staff\".\"address_id\" [update: cascade, delete: restrict]\n\nRef \"store_address_id_fkey\":\"address\".\"address_id\" < \"store\".\"address_id\" 
[update: cascade, delete: restrict]\n\nRef \"store_manager_staff_id_fkey\":\"staff\".\"staff_id\" < \"store\".\"manager_staff_id\" [update: cascade, delete: restrict]\n",
"stateUrl": null,
"stateOriginalUrl": "https://github.com/gordonkwokkwok/DVD-Rental-PostgreSQL-Project"
}
}
================================================
FILE: tasks/postgres/standard/dvdrental/film_inventory_management/verify.py
================================================
"""
Verification script for PostgreSQL Task 4: Film Inventory Management
"""
import os
import sys
import psycopg2
from decimal import Decimal
def rows_match(actual_row, expected_row):
"""Compare two rows with appropriate tolerance for decimals and floats."""
if len(actual_row) != len(expected_row):
return False
for actual, expected in zip(actual_row, expected_row):
if isinstance(actual, (Decimal, float)) and isinstance(expected, (Decimal, float)):
# Use higher tolerance for floating point comparisons
if abs(float(actual) - float(expected)) > 0.01:
return False
elif actual != expected:
return False
return True
def get_connection_params() -> dict:
"""Get database connection parameters."""
return {
"host": os.getenv("POSTGRES_HOST", "localhost"),
"port": int(os.getenv("POSTGRES_PORT", 5432)),
"database": os.getenv("POSTGRES_DATABASE"),
"user": os.getenv("POSTGRES_USERNAME"),
"password": os.getenv("POSTGRES_PASSWORD")
}
def check_new_films(conn) -> bool:
"""Check if the two new films were added correctly."""
with conn.cursor() as cur:
cur.execute("""
SELECT title, description, release_year, language_id,
rental_duration, rental_rate, length, replacement_cost,
rating
FROM film
WHERE title IN ('Data Science Adventures', 'Cloud Computing Chronicles')
ORDER BY title
""")
actual_films = cur.fetchall()
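        # The expected rental_rate for 'Data Science Adventures' (4.389) already reflects
        # step 3 of the task (PG-13 rates raised by 10%: 3.99 * 1.1 = 4.389); the 0.01
        # tolerance in rows_match absorbs rounding to the numeric(4,2) column.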
expected_films = [
('Cloud Computing Chronicles', 'Exploring the world of distributed systems', 2024, 1, 7, Decimal('4.99'), 135, Decimal('18.99'), 'PG'),
('Data Science Adventures', 'A thrilling journey through machine learning algorithms', 2024, 1, 5, Decimal('4.389'), 120, Decimal('15.99'), 'PG-13')
]
if len(actual_films) != len(expected_films):
print(f"❌ Expected {len(expected_films)} new films, found {len(actual_films)}")
return False
mismatches = 0
for i, (actual, expected) in enumerate(zip(actual_films, expected_films)):
if not rows_match(actual, expected):
print(f"❌ Film {i+1} mismatch: expected {expected}, got {actual}")
mismatches += 1
if mismatches > 0:
print(f"❌ Total film mismatches: {mismatches}")
return False
print("✅ Both new films added correctly")
return True
def check_inventory_records(conn) -> bool:
"""Check if inventory records were added for new films."""
with conn.cursor() as cur:
cur.execute("""
SELECT f.title, i.store_id, COUNT(*) as count
FROM film f
JOIN inventory i ON f.film_id = i.film_id
WHERE f.title IN ('Data Science Adventures', 'Cloud Computing Chronicles')
GROUP BY f.title, i.store_id
ORDER BY f.title, i.store_id
""")
actual_inventory = cur.fetchall()
expected_inventory = [
('Cloud Computing Chronicles', 1, 3),
('Cloud Computing Chronicles', 2, 2),
('Data Science Adventures', 1, 3),
('Data Science Adventures', 2, 2)
]
if len(actual_inventory) != len(expected_inventory):
print(f"❌ Expected {len(expected_inventory)} inventory groups, found {len(actual_inventory)}")
return False
mismatches = 0
for i, (actual, expected) in enumerate(zip(actual_inventory, expected_inventory)):
if not rows_match(actual, expected):
print(f"❌ Inventory group {i+1} mismatch: expected {expected}, got {actual}")
mismatches += 1
if mismatches > 0:
print(f"❌ Total inventory mismatches: {mismatches}")
return False
print("✅ Inventory records added correctly")
return True
def check_available_films_table(conn) -> bool:
"""Check if available_films table was created and populated correctly."""
with conn.cursor() as cur:
# Get actual results from the created table
cur.execute("""
SELECT film_id, title, rental_rate, length
FROM available_films
ORDER BY rental_rate DESC, length DESC, title ASC
""")
actual_results = cur.fetchall()
# Execute ground truth query
cur.execute("""
SELECT DISTINCT f.film_id, f.title, f.rental_rate, f.length
FROM film f
JOIN inventory i ON f.film_id = i.film_id
WHERE f.rental_rate >= 3.00 AND f.rental_rate <= 5.00
AND f.length > 100
AND i.store_id = 1
ORDER BY f.rental_rate DESC, f.length DESC, f.title ASC
""")
expected_results = cur.fetchall()
if len(actual_results) != len(expected_results):
print(f"❌ available_films table has {len(actual_results)} records, expected {len(expected_results)}")
return False
mismatches = 0
for i, (actual, expected) in enumerate(zip(actual_results, expected_results)):
if not rows_match(actual, expected):
if mismatches < 5: # Only show first 5 mismatches
print(f"❌ available_films row {i+1} mismatch: expected {expected}, got {actual}")
mismatches += 1
if mismatches > 0:
print(f"❌ Total available_films mismatches: {mismatches}")
return False
print(f"✅ available_films table created and populated correctly ({len(actual_results)} records)")
return True
def check_inventory_cleanup(conn) -> bool:
"""Check if inventory cleanup was performed correctly."""
with conn.cursor() as cur:
# Check that no inventory exists for films with replacement_cost > 25 AND rental_rate < 1
# that also don't have rental records (safe to delete)
cur.execute("""
SELECT COUNT(*)
FROM inventory i
JOIN film f ON i.film_id = f.film_id
WHERE f.replacement_cost > 25.00 AND f.rental_rate < 1.00
AND NOT EXISTS (SELECT 1 FROM rental r WHERE r.inventory_id = i.inventory_id)
""")
remaining_count = cur.fetchone()[0]
if remaining_count > 0:
print(f"❌ Found {remaining_count} inventory records that should have been deleted (no rental history)")
return False
print("✅ Inventory cleanup completed correctly")
return True
def check_summary_table(conn) -> bool:
"""Check if film_inventory_summary table was created and populated correctly."""
with conn.cursor() as cur:
# Get actual results from the created table
cur.execute("""
SELECT title, rental_rate, total_inventory, store1_count, store2_count
FROM film_inventory_summary
""")
actual_results = cur.fetchall()
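        # Note: this read has no ORDER BY; the task requires the summary rows to be
        # inserted already sorted (inventory count DESC, then title), so the table's
        # stored order is compared against the ordered ground-truth query below.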
# Execute ground truth query
cur.execute("""
SELECT f.title, f.rental_rate,
COUNT(i.inventory_id) as total_inventory,
COUNT(CASE WHEN i.store_id = 1 THEN 1 END) as store1_count,
COUNT(CASE WHEN i.store_id = 2 THEN 1 END) as store2_count
FROM film f
JOIN inventory i ON f.film_id = i.film_id
GROUP BY f.film_id, f.title, f.rental_rate
ORDER BY total_inventory DESC, f.title ASC
""")
expected_results = cur.fetchall()
if len(actual_results) != len(expected_results):
print(f"❌ film_inventory_summary table has {len(actual_results)} records, expected {len(expected_results)}")
return False
mismatches = 0
for i, (actual, expected) in enumerate(zip(actual_results, expected_results)):
if not rows_match(actual, expected):
if mismatches < 5: # Only show first 5 mismatches
print(f"❌ Summary row {i+1} mismatch: expected {expected}, got {actual}")
mismatches += 1
if mismatches > 0:
print(f"❌ Total summary table mismatches: {mismatches}")
return False
print(f"✅ film_inventory_summary table created and populated correctly ({len(actual_results)} records)")
return True
def main():
"""Main verification function."""
print("=" * 70)
print("PostgreSQL Task 4 Verification: Film Inventory Management")
print("=" * 70)
# Get connection parameters
conn_params = get_connection_params()
if not conn_params["database"]:
print("❌ No database specified")
sys.exit(1)
try:
# Connect to database
conn = psycopg2.connect(**conn_params)
# Verify all operations with short-circuit evaluation
success = (
check_new_films(conn) and
check_inventory_records(conn) and
check_available_films_table(conn) and
check_inventory_cleanup(conn) and
check_summary_table(conn)
)
conn.close()
if success:
print(f"\n🎉 Task verification: PASS")
sys.exit(0)
else:
print(f"\n❌ Task verification: FAIL")
sys.exit(1)
except psycopg2.Error as e:
print(f"❌ Database error: {e}")
sys.exit(1)
except Exception as e:
print(f"❌ Verification error: {e}")
sys.exit(1)
if __name__ == "__main__":
main()
================================================
FILE: tasks/postgres/standard/employees/employee_demographics_report/description.md
================================================
Generate a comprehensive employee demographics and basic statistics report for the annual company overview. The HR team needs simple, clear statistical summaries about our workforce composition to include in the annual report and diversity initiatives.
## Your Tasks:
1. **Create the gender statistics table** — build a table called `gender_statistics` in the `employees` schema with these exact columns:
* `gender` (varchar) — gender ('M' or 'F')
* `total_employees` (integer) — total number of employees of this gender
* `current_employees` (integer) — current employees of this gender (have active salary)
* `percentage_of_workforce` (decimal) — percentage of current workforce
2. **Create the age group analysis table** — build a table called `age_group_analysis` in the `employees` schema with:
* `age_group` (varchar) — age range ('20-29', '30-39', '40-49', '50-59', '60+')
* `employee_count` (integer) — number of current employees in age group
* `avg_salary` (decimal) — average current salary for age group
* `avg_tenure_days` (decimal) — average days of service
3. **Create the birth month distribution table** — build a table called `birth_month_distribution` in the `employees` schema with:
* `birth_month` (integer) — month number (1-12)
* `month_name` (varchar) — month name ('January', 'February', etc.)
* `employee_count` (integer) — total employees born in this month
* `current_employee_count` (integer) — current employees born in this month
4. **Create the hiring year summary table** — build a table called `hiring_year_summary` in the `employees` schema with:
* `hire_year` (integer) — year employees were hired
* `employees_hired` (integer) — number of employees hired that year
* `still_employed` (integer) — how many from that year are still employed
* `retention_rate` (decimal) — percentage still employed (still_employed/employees_hired * 100)
5. **Apply age group classification** based on current age:
* **20-29**: Ages 20-29
* **30-39**: Ages 30-39
* **40-49**: Ages 40-49
* **50-59**: Ages 50-59
* **60+**: Ages 60 and above
6. **Calculate workforce composition** — determine current workforce demographics using employees with active salary records (to_date = '9999-01-01').
7. **Focus on basic statistics** — create simple counts, averages, and percentages that are easy to understand and verify.
The analysis will provide clear demographic insights for HR reporting and workforce planning.
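For task 1, a minimal sketch (assuming, as stated in task 6, that current employees are those with an active salary row where to_date = '9999-01-01'):
```sql
CREATE TABLE employees.gender_statistics (
    gender                  VARCHAR,
    total_employees         INTEGER,
    current_employees       INTEGER,
    percentage_of_workforce DECIMAL
);

INSERT INTO employees.gender_statistics
SELECT e.gender::varchar,
       COUNT(*),
       COUNT(*) FILTER (WHERE ce.employee_id IS NOT NULL),
       COUNT(*) FILTER (WHERE ce.employee_id IS NOT NULL)::decimal
           / (SELECT COUNT(DISTINCT employee_id)
              FROM employees.salary
              WHERE to_date = DATE '9999-01-01') * 100
FROM employees.employee e
LEFT JOIN (SELECT DISTINCT employee_id
           FROM employees.salary
           WHERE to_date = DATE '9999-01-01') ce ON ce.employee_id = e.id
GROUP BY e.gender;
```
The other three tables follow the same pattern: a CREATE TABLE matching the listed columns plus an INSERT ... SELECT aggregate.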
================================================
FILE: tasks/postgres/standard/employees/employee_demographics_report/meta.json
================================================
{
"task_id": "employee_demographics_report",
"task_name": "Employee Demographics Report",
"category_id": "employees",
"category_name": "Employees",
"description": "Generate comprehensive employee demographics report with gender statistics, age groups, birth months, and hiring trends.",
"author": "Lingxiao Du",
"created_at": "2025-08-14",
"difficulty": "L3",
"tags": [
"reporting and analytics",
"statistical aggregation"
],
"mcp": [
"postgres"
],
"meta_data": {
"stateType": "text",
"stateContent": "Enum \"employees\".\"employee_gender\" {\n \"M\"\n \"F\"\n}\n\nTable \"employees\".\"department\" {\n \"id\" bpchar(4) [pk, not null]\n \"dept_name\" varchar(40) [unique, not null]\n}\n\nTable \"employees\".\"department_employee\" {\n \"employee_id\" int8 [not null]\n \"department_id\" bpchar(4) [not null]\n \"from_date\" date [not null]\n \"to_date\" date [not null]\n\n Indexes {\n (employee_id, department_id) [type: btree, name: \"idx_16982_primary\"]\n department_id [type: btree, name: \"idx_16982_dept_no\"]\n }\n}\n\nTable \"employees\".\"department_manager\" {\n \"employee_id\" int8 [not null]\n \"department_id\" bpchar(4) [not null]\n \"from_date\" date [not null]\n \"to_date\" date [not null]\n\n Indexes {\n (employee_id, department_id) [type: btree, name: \"idx_16985_primary\"]\n department_id [type: btree, name: \"idx_16985_dept_no\"]\n }\n}\n\nTable \"employees\".\"employee\" {\n \"id\" int8 [pk, not null, increment]\n \"birth_date\" date [not null]\n \"first_name\" varchar(14) [not null]\n \"last_name\" varchar(16) [not null]\n \"gender\" employees.employee_gender [not null]\n \"hire_date\" date [not null]\n}\n\nTable \"employees\".\"salary\" {\n \"employee_id\" int8 [not null]\n \"amount\" int8 [not null]\n \"from_date\" date [not null]\n \"to_date\" date [not null]\n\n Indexes {\n (employee_id, from_date) [type: btree, name: \"idx_16991_primary\"]\n }\n}\n\nTable \"employees\".\"title\" {\n \"employee_id\" int8 [not null]\n \"title\" varchar(50) [not null]\n \"from_date\" date [not null]\n \"to_date\" date\n\n Indexes {\n (employee_id, title, from_date) [type: btree, name: \"idx_16994_primary\"]\n }\n}\n\nRef \"dept_emp_ibfk_1\":\"employees\".\"employee\".\"id\" < \"employees\".\"department_employee\".\"employee_id\" [update: restrict, delete: cascade]\n\nRef \"dept_emp_ibfk_2\":\"employees\".\"department\".\"id\" < \"employees\".\"department_employee\".\"department_id\" [update: restrict, delete: cascade]\n\nRef \"dept_manager_ibfk_1\":\"employees\".\"employee\".\"id\" < \"employees\".\"department_manager\".\"employee_id\" [update: restrict, delete: cascade]\n\nRef \"dept_manager_ibfk_2\":\"employees\".\"department\".\"id\" < \"employees\".\"department_manager\".\"department_id\" [update: restrict, delete: cascade]\n\nRef \"salaries_ibfk_1\":\"employees\".\"employee\".\"id\" < \"employees\".\"salary\".\"employee_id\" [update: restrict, delete: cascade]\n\nRef \"titles_ibfk_1\":\"employees\".\"employee\".\"id\" < \"employees\".\"title\".\"employee_id\" [update: restrict, delete: cascade]\n",
"stateUrl": null,
"stateOriginalUrl": "https://github.com/neondatabase-labs/postgres-sample-dbs/blob/main/employees.sql.gz"
}
}
================================================
FILE: tasks/postgres/standard/employees/employee_demographics_report/verify.py
================================================
"""
Verification script for PostgreSQL Task 3: Employee Demographics Report
"""
import os
import sys
import psycopg2
from decimal import Decimal
def rows_match(actual_row, expected_row):
"""
Compare two rows with appropriate tolerance.
For Decimal types: allows 0.1 tolerance
For other types: requires exact match
"""
if len(actual_row) != len(expected_row):
return False
for actual, expected in zip(actual_row, expected_row):
if isinstance(actual, Decimal) and isinstance(expected, Decimal):
if abs(float(actual) - float(expected)) > 0.1:
return False
elif actual != expected:
return False
return True
def get_connection_params() -> dict:
"""Get database connection parameters."""
return {
"host": os.getenv("POSTGRES_HOST", "localhost"),
"port": int(os.getenv("POSTGRES_PORT", 5432)),
"database": os.getenv("POSTGRES_DATABASE"),
"user": os.getenv("POSTGRES_USERNAME"),
"password": os.getenv("POSTGRES_PASSWORD")
}
def verify_gender_statistics_results(conn) -> bool:
"""Verify the gender statistics results."""
with conn.cursor() as cur:
# Get actual results from the created table
cur.execute("""
SELECT gender, total_employees, current_employees, percentage_of_workforce
FROM employees.gender_statistics
ORDER BY gender
""")
actual_results = cur.fetchall()
# Execute ground truth query
cur.execute("""
WITH current_emp AS (
SELECT DISTINCT s.employee_id
FROM employees.salary s
WHERE s.to_date = DATE '9999-01-01'
),
total_current AS (
SELECT COUNT(*) AS cnt
FROM current_emp
)
SELECT
e.gender::varchar AS gender,
COUNT(*) AS total_employees,
COUNT(*) FILTER (WHERE ce.employee_id IS NOT NULL) AS current_employees,
(COUNT(*) FILTER (WHERE ce.employee_id IS NOT NULL))::DECIMAL
/ NULLIF((SELECT cnt FROM total_current), 0) * 100 AS percentage_of_workforce
FROM employees.employee e
LEFT JOIN current_emp ce ON ce.employee_id = e.id
WHERE e.gender IN ('M','F')
GROUP BY e.gender
ORDER BY gender;
""")
expected_results = cur.fetchall()
if len(actual_results) != len(expected_results):
print(f"❌ Expected {len(expected_results)} gender statistics results, got {len(actual_results)}")
return False
mismatches = 0
for i, (actual, expected) in enumerate(zip(actual_results, expected_results)):
if not rows_match(actual, expected):
if mismatches < 5: # Only show first 5 mismatches
print(f"❌ Row {i+1} mismatch: expected {expected}, got {actual}")
mismatches += 1
if mismatches > 0:
print(f"❌ Total mismatches: {mismatches}")
return False
print(f"✅ Gender statistics results are correct ({len(actual_results)} records)")
return True
def verify_age_group_results(conn) -> bool:
"""Verify the age group analysis results."""
with conn.cursor() as cur:
# Get actual results from the created table
cur.execute("""
SELECT age_group, employee_count, avg_salary, avg_tenure_days
FROM employees.age_group_analysis
ORDER BY age_group
""")
actual_results = cur.fetchall()
# Execute ground truth query
cur.execute("""
WITH current_salary AS (
SELECT employee_id, amount
FROM (
SELECT s.*,
ROW_NUMBER() OVER (
PARTITION BY s.employee_id
ORDER BY s.from_date DESC, s.amount DESC
) AS rn
FROM employees.salary s
WHERE s.to_date = DATE '9999-01-01'
) x
WHERE rn = 1
),
emp_age AS (
SELECT
e.id AS employee_id,
e.hire_date,
EXTRACT(YEAR FROM AGE(CURRENT_DATE, e.birth_date))::INT AS age_years
FROM employees.employee e
WHERE e.birth_date IS NOT NULL
)
SELECT
CASE
WHEN a.age_years BETWEEN 20 AND 29 THEN '20-29'
WHEN a.age_years BETWEEN 30 AND 39 THEN '30-39'
WHEN a.age_years BETWEEN 40 AND 49 THEN '40-49'
WHEN a.age_years BETWEEN 50 AND 59 THEN '50-59'
WHEN a.age_years >= 60 THEN '60+'
END AS age_group,
COUNT(*)::INT AS employee_count,
AVG(cs.amount) AS avg_salary,
AVG((CURRENT_DATE - a.hire_date)::INT) AS avg_tenure_days
FROM emp_age a
JOIN current_salary cs ON cs.employee_id = a.employee_id
WHERE a.age_years >= 20
GROUP BY 1
ORDER BY 1;
""")
expected_results = cur.fetchall()
if len(actual_results) != len(expected_results):
print(f"❌ Expected {len(expected_results)} age group results, got {len(actual_results)}")
return False
mismatches = 0
for i, (actual, expected) in enumerate(zip(actual_results, expected_results)):
if not rows_match(actual, expected):
if mismatches < 5: # Only show first 5 mismatches
print(f"❌ Row {i+1} mismatch: expected {expected}, got {actual}")
mismatches += 1
if mismatches > 0:
print(f"❌ Total mismatches: {mismatches}")
return False
print(f"✅ Age group analysis results are correct ({len(actual_results)} records)")
return True
def verify_birth_month_results(conn) -> bool:
"""Verify the birth month distribution results."""
with conn.cursor() as cur:
# Get actual results from the created table
cur.execute("""
SELECT birth_month, month_name, employee_count, current_employee_count
FROM employees.birth_month_distribution
ORDER BY birth_month
""")
actual_results = cur.fetchall()
# Execute ground truth query
cur.execute("""
WITH current_emp AS (
SELECT DISTINCT s.employee_id
FROM employees.salary s
WHERE s.to_date = DATE '9999-01-01'
),
months AS (
SELECT gs AS birth_month
FROM generate_series(1, 12) AS gs
)
SELECT
m.birth_month::INTEGER AS birth_month,
CASE m.birth_month
WHEN 1 THEN 'January' WHEN 2 THEN 'February' WHEN 3 THEN 'March'
WHEN 4 THEN 'April' WHEN 5 THEN 'May' WHEN 6 THEN 'June'
WHEN 7 THEN 'July' WHEN 8 THEN 'August' WHEN 9 THEN 'September'
            WHEN 10 THEN 'October' WHEN 11 THEN 'November' WHEN 12 THEN 'December'
END AS month_name,
COUNT(e.id)::INTEGER AS employee_count,
COUNT(ce.employee_id)::INTEGER AS current_employee_count
FROM months m
LEFT JOIN employees.employee e
ON EXTRACT(MONTH FROM e.birth_date) = m.birth_month
LEFT JOIN current_emp ce
ON ce.employee_id = e.id
GROUP BY m.birth_month
ORDER BY m.birth_month;
""")
expected_results = cur.fetchall()
if len(actual_results) != len(expected_results):
print(f"❌ Expected {len(expected_results)} birth month results, got {len(actual_results)}")
return False
mismatches = 0
for i, (actual, expected) in enumerate(zip(actual_results, expected_results)):
if not rows_match(actual, expected):
if mismatches < 5: # Only show first 5 mismatches
print(f"❌ Row {i+1} mismatch: expected {expected}, got {actual}")
mismatches += 1
if mismatches > 0:
print(f"❌ Total mismatches: {mismatches}")
return False
print(f"✅ Birth month distribution results are correct ({len(actual_results)} records)")
return True
def verify_hiring_year_results(conn) -> bool:
"""Verify the hiring year summary results."""
with conn.cursor() as cur:
# Get actual results from the created table
cur.execute("""
SELECT hire_year, employees_hired, still_employed, retention_rate
FROM employees.hiring_year_summary
ORDER BY hire_year
""")
actual_results = cur.fetchall()
# Execute ground truth query
cur.execute("""
WITH current_emp AS (
SELECT DISTINCT s.employee_id
FROM employees.salary s
WHERE s.to_date = DATE '9999-01-01'
),
base AS (
SELECT e.id, EXTRACT(YEAR FROM e.hire_date)::INT AS hire_year
FROM employees.employee e
WHERE e.hire_date IS NOT NULL
)
SELECT
b.hire_year,
COUNT(*)::INT AS employees_hired,
COUNT(*) FILTER (WHERE ce.employee_id IS NOT NULL)::INT AS still_employed,
(COUNT(*) FILTER (WHERE ce.employee_id IS NOT NULL))::DECIMAL
/ NULLIF(COUNT(*), 0) * 100 AS retention_rate
FROM base b
LEFT JOIN current_emp ce ON ce.employee_id = b.id
GROUP BY b.hire_year
ORDER BY b.hire_year;
""")
expected_results = cur.fetchall()
if len(actual_results) != len(expected_results):
print(f"❌ Expected {len(expected_results)} hiring year results, got {len(actual_results)}")
return False
mismatches = 0
for i, (actual, expected) in enumerate(zip(actual_results, expected_results)):
if not rows_match(actual, expected):
if mismatches < 5: # Only show first 5 mismatches
print(f"❌ Row {i+1} mismatch: expected {expected}, got {actual}")
mismatches += 1
if mismatches > 0:
print(f"❌ Total mismatches: {mismatches}")
return False
print(f"✅ Hiring year summary results are correct ({len(actual_results)} records)")
return True
def main():
"""Main verification function."""
print("=" * 50)
# Get connection parameters
conn_params = get_connection_params()
if not conn_params["database"]:
print("❌ No database specified")
sys.exit(1)
try:
# Connect to database
conn = psycopg2.connect(**conn_params)
# Verify all four analysis results
success = (
verify_gender_statistics_results(conn) and
verify_age_group_results(conn) and
verify_birth_month_results(conn) and
verify_hiring_year_results(conn)
)
conn.close()
if success:
print("\n🎉 Task verification: PASS")
sys.exit(0)
else:
print("\n❌ Task verification: FAIL")
sys.exit(1)
except psycopg2.Error as e:
print(f"❌ Database error: {e}")
sys.exit(1)
except Exception as e:
print(f"❌ Verification error: {e}")
sys.exit(1)
if __name__ == "__main__":
main()
================================================
FILE: tasks/postgres/standard/employees/employee_performance_analysis/description.md
================================================
Create a comprehensive employee performance evaluation system that analyzes career progression patterns and salary equity across our organization. The executive team needs data-driven insights for upcoming promotion decisions and salary adjustment planning.
## Your Tasks:
1. **Create the employee performance analysis table** — build a table called `employee_performance_analysis` in the `employees` schema with these exact columns:
* `employee_id` (bigint) — the employee's ID
* `performance_category` (varchar) — classification of employee performance ('high_achiever', 'steady_performer', 'needs_attention')
* `salary_growth_rate` (decimal) — percentage salary increase from first salary record to current
* `days_of_service` (integer) — total days with the company
* `promotion_count` (integer) — number of different titles held
2. **Analyze only current employees** — focus on employees who currently have active salary records (to_date = '9999-01-01').
3. **Apply performance classification rules**:
* **High achievers**: Salary growth rate > 40% AND more than 1 title held
* **Needs attention**: Salary growth rate < 15% AND more than 3650 days of service (10 years)
* **Steady performers**: All other current employees (default category)
4. **Create the department salary analysis table** — build a table called `department_salary_analysis` in the `employees` schema with:
* `department_name` (varchar) — the department name
* `avg_current_salary` (decimal) — average current salary in the department (only current employees)
* `employee_count` (integer) — total current employees in the department
* `salary_range_spread` (integer) — difference between max and min salary (current employees only)
5. **Calculate salary equity metrics** — populate the department table with current salary statistics for active employees only to identify potential pay equity issues across departments.
The analysis should help leadership make informed decisions about promotions, salary adjustments, and talent retention strategies.
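As a sketch of how the classification in tasks 1-3 could be computed (assuming salary growth is measured from each employee's earliest salary row to the currently active one, and that promotion_count is the number of distinct titles held):
```sql
WITH first_sal AS (
    SELECT DISTINCT ON (employee_id) employee_id, amount AS first_amount
    FROM employees.salary
    ORDER BY employee_id, from_date
),
current_sal AS (
    SELECT DISTINCT ON (employee_id) employee_id, amount AS current_amount
    FROM employees.salary
    WHERE to_date = DATE '9999-01-01'
    ORDER BY employee_id, from_date DESC
),
titles AS (
    SELECT employee_id, COUNT(DISTINCT title) AS promotion_count
    FROM employees.title
    GROUP BY employee_id
)
SELECT e.id AS employee_id,
       CASE
           WHEN (c.current_amount - f.first_amount) * 100.0 / f.first_amount > 40
                AND COALESCE(t.promotion_count, 0) > 1                   THEN 'high_achiever'
           WHEN (c.current_amount - f.first_amount) * 100.0 / f.first_amount < 15
                AND (CURRENT_DATE - e.hire_date) > 3650                  THEN 'needs_attention'
           ELSE 'steady_performer'
       END AS performance_category,
       (c.current_amount - f.first_amount) * 100.0 / f.first_amount AS salary_growth_rate,
       (CURRENT_DATE - e.hire_date)   AS days_of_service,
       COALESCE(t.promotion_count, 0) AS promotion_count
FROM employees.employee e
JOIN current_sal c ON c.employee_id = e.id
JOIN first_sal   f ON f.employee_id = e.id
LEFT JOIN titles t ON t.employee_id = e.id;
```
The `department_salary_analysis` table in tasks 4-5 can be populated with a similar aggregate over current department assignments and active salaries.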
================================================
FILE: tasks/postgres/standard/employees/employee_performance_analysis/meta.json
================================================
{
"task_id": "employee_performance_analysis",
"task_name": "Employee Performance Analysis",
"category_id": "employees",
"category_name": "Employees",
"description": "Create performance evaluation system analyzing career progression patterns and salary equity for promotion and compensation decisions.",
"author": "Lingxiao Du",
"created_at": "2025-08-14",
"difficulty": "L3",
"tags": [
"reporting and analytics",
"statistical aggregation",
"schema design"
],
"mcp": [
"postgres"
],
"meta_data": {
"stateType": "text",
"stateContent": "Enum \"employees\".\"employee_gender\" {\n \"M\"\n \"F\"\n}\n\nTable \"employees\".\"department\" {\n \"id\" bpchar(4) [pk, not null]\n \"dept_name\" varchar(40) [unique, not null]\n}\n\nTable \"employees\".\"department_employee\" {\n \"employee_id\" int8 [not null]\n \"department_id\" bpchar(4) [not null]\n \"from_date\" date [not null]\n \"to_date\" date [not null]\n\n Indexes {\n (employee_id, department_id) [type: btree, name: \"idx_16982_primary\"]\n department_id [type: btree, name: \"idx_16982_dept_no\"]\n }\n}\n\nTable \"employees\".\"department_manager\" {\n \"employee_id\" int8 [not null]\n \"department_id\" bpchar(4) [not null]\n \"from_date\" date [not null]\n \"to_date\" date [not null]\n\n Indexes {\n (employee_id, department_id) [type: btree, name: \"idx_16985_primary\"]\n department_id [type: btree, name: \"idx_16985_dept_no\"]\n }\n}\n\nTable \"employees\".\"employee\" {\n \"id\" int8 [pk, not null, increment]\n \"birth_date\" date [not null]\n \"first_name\" varchar(14) [not null]\n \"last_name\" varchar(16) [not null]\n \"gender\" employees.employee_gender [not null]\n \"hire_date\" date [not null]\n}\n\nTable \"employees\".\"salary\" {\n \"employee_id\" int8 [not null]\n \"amount\" int8 [not null]\n \"from_date\" date [not null]\n \"to_date\" date [not null]\n\n Indexes {\n (employee_id, from_date) [type: btree, name: \"idx_16991_primary\"]\n }\n}\n\nTable \"employees\".\"title\" {\n \"employee_id\" int8 [not null]\n \"title\" varchar(50) [not null]\n \"from_date\" date [not null]\n \"to_date\" date\n\n Indexes {\n (employee_id, title, from_date) [type: btree, name: \"idx_16994_primary\"]\n }\n}\n\nRef \"dept_emp_ibfk_1\":\"employees\".\"employee\".\"id\" < \"employees\".\"department_employee\".\"employee_id\" [update: restrict, delete: cascade]\n\nRef \"dept_emp_ibfk_2\":\"employees\".\"department\".\"id\" < \"employees\".\"department_employee\".\"department_id\" [update: restrict, delete: cascade]\n\nRef \"dept_manager_ibfk_1\":\"employees\".\"employee\".\"id\" < \"employees\".\"department_manager\".\"employee_id\" [update: restrict, delete: cascade]\n\nRef \"dept_manager_ibfk_2\":\"employees\".\"department\".\"id\" < \"employees\".\"department_manager\".\"department_id\" [update: restrict, delete: cascade]\n\nRef \"salaries_ibfk_1\":\"employees\".\"employee\".\"id\" < \"employees\".\"salary\".\"employee_id\" [update: restrict, delete: cascade]\n\nRef \"titles_ibfk_1\":\"employees\".\"employee\".\"id\" < \"employees\".\"title\".\"employee_id\" [update: restrict, delete: cascade]\n",
"stateUrl": null,
"stateOriginalUrl": "https://github.com/neondatabase-labs/postgres-sample-dbs/blob/main/employees.sql.gz"
}
}
================================================
FILE: tasks/postgres/standard/employees/employee_performance_analysis/verify.py
================================================
"""
Verification script for PostgreSQL Task 1: Employee Performance Analysis
"""
import os
import sys
import psycopg2
from decimal import Decimal
def rows_match(actual_row, expected_row):
"""
Compare two rows with appropriate tolerance.
For Decimal types: allows 0.1 tolerance
For other types: requires exact match
"""
if len(actual_row) != len(expected_row):
return False
for actual, expected in zip(actual_row, expected_row):
if isinstance(actual, Decimal) and isinstance(expected, Decimal):
if abs(float(actual) - float(expected)) > 0.1:
return False
elif actual != expected:
return False
return True
def get_connection_params() -> dict:
"""Get database connection parameters."""
return {
"host": os.getenv("POSTGRES_HOST", "localhost"),
"port": int(os.getenv("POSTGRES_PORT", 5432)),
"database": os.getenv("POSTGRES_DATABASE"),
"user": os.getenv("POSTGRES_USERNAME"),
"password": os.getenv("POSTGRES_PASSWORD")
}
def verify_performance_results(conn) -> bool:
"""Verify the employee performance analysis results."""
with conn.cursor() as cur:
# Get actual results from the created table
cur.execute("""
SELECT employee_id, performance_category, salary_growth_rate,
days_of_service, promotion_count
FROM employees.employee_performance_analysis
ORDER BY employee_id
""")
actual_results = cur.fetchall()
# Execute ground truth query - use first salary record as starting salary
cur.execute("""
WITH current_salary AS (
SELECT employee_id, amount AS current_amount
FROM (
SELECT s.*,
ROW_NUMBER() OVER (PARTITION BY s.employee_id
ORDER BY s.from_date DESC, s.amount DESC) AS rn
FROM employees.salary s
WHERE s.to_date = DATE '9999-01-01'
) x
WHERE rn = 1
),
first_salary AS (
SELECT employee_id, amount AS first_amount
FROM (
SELECT s.*,
ROW_NUMBER() OVER (PARTITION BY s.employee_id
ORDER BY s.from_date ASC, s.amount ASC) AS rn
FROM employees.salary s
) x
WHERE rn = 1
),
title_counts AS (
SELECT t.employee_id, COUNT(DISTINCT t.title) AS promotion_count
FROM employees.title t
GROUP BY t.employee_id
),
base AS (
SELECT e.id AS employee_id,
e.hire_date,
cs.current_amount,
fs.first_amount,
COALESCE(tc.promotion_count, 0) AS promotion_count
FROM employees.employee e
JOIN current_salary cs ON cs.employee_id = e.id
JOIN first_salary fs ON fs.employee_id = e.id
LEFT JOIN title_counts tc ON tc.employee_id = e.id
),
scored AS (
SELECT
employee_id,
((current_amount - first_amount) / NULLIF(first_amount, 0)::NUMERIC) * 100 AS salary_growth_rate,
(CURRENT_DATE - hire_date)::INTEGER AS days_of_service,
promotion_count
FROM base
)
SELECT
s.employee_id,
CASE
WHEN s.salary_growth_rate > 40 AND s.promotion_count > 1 THEN 'high_achiever'
WHEN s.salary_growth_rate < 15 AND s.days_of_service > 3650 THEN 'needs_attention'
ELSE 'steady_performer'
END AS performance_category,
s.salary_growth_rate,
s.days_of_service,
s.promotion_count AS promotion_count
FROM scored s
ORDER BY s.employee_id;
""")
expected_results = cur.fetchall()
if len(actual_results) != len(expected_results):
print(f"❌ Expected {len(expected_results)} performance results, got {len(actual_results)}")
return False
mismatches = 0
for i, (actual, expected) in enumerate(zip(actual_results, expected_results)):
if not rows_match(actual, expected):
if mismatches < 5: # Only show first 5 mismatches
print(f"❌ Row {i+1} mismatch: expected {expected}, got {actual}")
mismatches += 1
if mismatches > 0:
print(f"❌ Total mismatches: {mismatches}")
return False
print(f"✅ Employee performance results are correct ({len(actual_results)} records)")
return True
def verify_department_results(conn) -> bool:
"""Verify the department salary analysis results."""
with conn.cursor() as cur:
# Get actual results from the created table
cur.execute("""
SELECT department_name, avg_current_salary, employee_count, salary_range_spread
FROM employees.department_salary_analysis
ORDER BY department_name
""")
actual_results = cur.fetchall()
# Execute ground truth query
cur.execute("""
WITH current_salary AS (
SELECT employee_id, amount
FROM (
SELECT s.*,
ROW_NUMBER() OVER (PARTITION BY s.employee_id
ORDER BY s.from_date DESC, s.amount DESC) AS rn
FROM employees.salary s
WHERE s.to_date = DATE '9999-01-01'
) x
WHERE rn = 1
),
current_dept AS (
SELECT DISTINCT de.employee_id, de.department_id
FROM employees.department_employee de
WHERE de.to_date = DATE '9999-01-01'
)
SELECT
d.dept_name AS department_name,
AVG(cs.amount)::DECIMAL AS avg_current_salary,
COUNT(DISTINCT cd.employee_id) AS employee_count,
(MAX(cs.amount) - MIN(cs.amount)) AS salary_range_spread
FROM employees.department d
JOIN current_dept cd ON cd.department_id = d.id
JOIN current_salary cs ON cs.employee_id = cd.employee_id
GROUP BY d.id, d.dept_name
ORDER BY d.dept_name;
""")
expected_results = cur.fetchall()
if len(actual_results) != len(expected_results):
print(f"❌ Expected {len(expected_results)} department results, got {len(actual_results)}")
return False
mismatches = 0
for i, (actual, expected) in enumerate(zip(actual_results, expected_results)):
if not rows_match(actual, expected):
print(f"❌ Row {i+1} mismatch: expected {expected}, got {actual}")
mismatches += 1
if mismatches > 0:
print(f"❌ Total mismatches: {mismatches}")
return False
print(f"✅ Department salary results are correct ({len(actual_results)} records)")
return True
def main():
"""Main verification function."""
print("=" * 50)
# Get connection parameters
conn_params = get_connection_params()
if not conn_params["database"]:
print("❌ No database specified")
sys.exit(1)
try:
# Connect to database
conn = psycopg2.connect(**conn_params)
# Verify results
success = verify_performance_results(conn) and verify_department_results(conn)
conn.close()
if success:
print("\n🎉 Task verification: PASS")
sys.exit(0)
else:
print("\n❌ Task verification: FAIL")
sys.exit(1)
except psycopg2.Error as e:
print(f"❌ Database error: {e}")
sys.exit(1)
except Exception as e:
print(f"❌ Verification error: {e}")
sys.exit(1)
if __name__ == "__main__":
main()
================================================
FILE: tasks/postgres/standard/employees/employee_project_tracking/description.md
================================================
Create and manage a comprehensive employee project tracking system using database schema design and data manipulation operations. The IT team needs you to build the database structure from scratch and populate it with specific initial data to support project management workflows.
## Your Tasks:
1. **Create the project tracking tables** — build three new tables in the `employees` schema:
**Table 1: `employee_projects`**
* `project_id` (integer, primary key, auto-increment)
* `project_name` (varchar(100), not null)
* `start_date` (date, not null)
* `end_date` (date)
* `budget` (decimal(10,2))
* `status` (varchar(20), default 'active')
**Table 2: `project_assignments`**
* `assignment_id` (integer, primary key, auto-increment)
* `employee_id` (bigint, not null)
* `project_id` (integer, not null)
* `role` (varchar(50), not null)
* `allocation_percentage` (integer, check constraint: between 1 and 100)
* `assigned_date` (date, not null)
**Table 3: `project_milestones`**
* `milestone_id` (integer, primary key, auto-increment)
* `project_id` (integer, not null)
* `milestone_name` (varchar(100), not null)
* `due_date` (date, not null)
* `completed` (boolean, default false)
2. **Add foreign key relationships**:
* `project_assignments.employee_id` → `employees.employee.id`
* `project_assignments.project_id` → `employees.employee_projects.project_id`
* `project_milestones.project_id` → `employees.employee_projects.project_id`
3. **Create performance indexes**:
* Index named `idx_projects_status` on `employee_projects.status`
* Composite index named `idx_assignments_emp_proj` on `project_assignments(employee_id, project_id)`
* Index named `idx_milestones_due_date` on `project_milestones.due_date`
4. **Insert exactly this initial data**:
**Into `employee_projects`:**
* Project 1: name='Database Modernization', start_date='2024-01-15', end_date='2024-06-30', budget=250000.00, status='active'
* Project 2: name='Employee Portal Upgrade', start_date='2024-02-01', end_date='2024-05-15', budget=180000.00, status='active'
* Project 3: name='HR Analytics Dashboard', start_date='2023-11-01', end_date='2024-01-31', budget=120000.00, status='active'
**Into `project_assignments` (assign ALL current employees):**
* All employees from Development department → Project 1 ('Database Modernization'), role='Developer', allocation=80%
* All employees from Human Resources department → Project 2 ('Employee Portal Upgrade'), role='Business Analyst', allocation=60%
* All employees from Marketing department → Project 3 ('HR Analytics Dashboard'), role='Marketing Specialist', allocation=40%
* All employees from Finance department → Project 1 ('Database Modernization'), role='Financial Analyst', allocation=30%
* All employees from Sales department → Project 2 ('Employee Portal Upgrade'), role='Sales Representative', allocation=50%
* All employees from Research department → Project 3 ('HR Analytics Dashboard'), role='Research Analyst', allocation=70%
* All employees from Production department → Project 1 ('Database Modernization'), role='Production Coordinator', allocation=45%
* All employees from Quality Management department → Project 2 ('Employee Portal Upgrade'), role='QA Specialist', allocation=85%
* All employees from Customer Service department → Project 3 ('HR Analytics Dashboard'), role='Customer Success', allocation=35%
* All employees should have assigned_date='2024-01-01'
**Into `project_milestones`:**
* Project 1: 'Design Phase Complete' due '2024-03-01', 'Implementation Complete' due '2024-05-15'
* Project 2: 'UI/UX Approval' due '2024-03-15', 'Beta Testing' due '2024-04-30'
* Project 3: 'Data Collection' due '2023-12-15', 'Dashboard Launch' due '2024-01-25'
5. **Perform these exact data updates**:
* Update Project 3 ('HR Analytics Dashboard') status to 'completed'
* Increase budget by 15% for all projects with status 'active'
* Mark the milestone 'Data Collection' as completed (set completed = true)
6. **Add new column to `employee_projects`**:
* Add `priority` column (varchar(10)) with check constraint allowing only 'low', 'medium', 'high'
* Update all existing projects: set priority='high' for 'Database Modernization', priority='medium' for others
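For reference, a sketch of the DDL for the first two tables and two of the indexes (the milestones table, the remaining index, and the inserts and updates follow the same pattern, using exactly the names and values listed above):
```sql
CREATE TABLE employees.employee_projects (
    project_id   SERIAL PRIMARY KEY,
    project_name VARCHAR(100) NOT NULL,
    start_date   DATE NOT NULL,
    end_date     DATE,
    budget       DECIMAL(10,2),
    status       VARCHAR(20) DEFAULT 'active'
);

CREATE TABLE employees.project_assignments (
    assignment_id         SERIAL PRIMARY KEY,
    employee_id           BIGINT NOT NULL REFERENCES employees.employee (id),
    project_id            INTEGER NOT NULL REFERENCES employees.employee_projects (project_id),
    "role"                VARCHAR(50) NOT NULL,
    allocation_percentage INTEGER CHECK (allocation_percentage BETWEEN 1 AND 100),
    assigned_date         DATE NOT NULL
);

CREATE INDEX idx_projects_status      ON employees.employee_projects (status);
CREATE INDEX idx_assignments_emp_proj ON employees.project_assignments (employee_id, project_id);
```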
================================================
FILE: tasks/postgres/standard/employees/employee_project_tracking/meta.json
================================================
{
"task_id": "employee_project_tracking",
"task_name": "Employee Project Tracking",
"category_id": "employees",
"category_name": "Employees",
"description": "Build project tracking system from scratch with tables for projects, assignments, milestones, and performance indexes.",
"author": "Lingxiao Du",
"created_at": "2025-08-14",
"difficulty": "L3",
"tags": [
"schema design",
"data migration",
"data integrity enforcement"
],
"mcp": [
"postgres"
],
"meta_data": {
"stateType": "text",
"stateContent": "Enum \"employees\".\"employee_gender\" {\n \"M\"\n \"F\"\n}\n\nTable \"employees\".\"department\" {\n \"id\" bpchar(4) [pk, not null]\n \"dept_name\" varchar(40) [unique, not null]\n}\n\nTable \"employees\".\"department_employee\" {\n \"employee_id\" int8 [not null]\n \"department_id\" bpchar(4) [not null]\n \"from_date\" date [not null]\n \"to_date\" date [not null]\n\n Indexes {\n (employee_id, department_id) [type: btree, name: \"idx_16982_primary\"]\n department_id [type: btree, name: \"idx_16982_dept_no\"]\n }\n}\n\nTable \"employees\".\"department_manager\" {\n \"employee_id\" int8 [not null]\n \"department_id\" bpchar(4) [not null]\n \"from_date\" date [not null]\n \"to_date\" date [not null]\n\n Indexes {\n (employee_id, department_id) [type: btree, name: \"idx_16985_primary\"]\n department_id [type: btree, name: \"idx_16985_dept_no\"]\n }\n}\n\nTable \"employees\".\"employee\" {\n \"id\" int8 [pk, not null, increment]\n \"birth_date\" date [not null]\n \"first_name\" varchar(14) [not null]\n \"last_name\" varchar(16) [not null]\n \"gender\" employees.employee_gender [not null]\n \"hire_date\" date [not null]\n}\n\nTable \"employees\".\"salary\" {\n \"employee_id\" int8 [not null]\n \"amount\" int8 [not null]\n \"from_date\" date [not null]\n \"to_date\" date [not null]\n\n Indexes {\n (employee_id, from_date) [type: btree, name: \"idx_16991_primary\"]\n }\n}\n\nTable \"employees\".\"title\" {\n \"employee_id\" int8 [not null]\n \"title\" varchar(50) [not null]\n \"from_date\" date [not null]\n \"to_date\" date\n\n Indexes {\n (employee_id, title, from_date) [type: btree, name: \"idx_16994_primary\"]\n }\n}\n\nRef \"dept_emp_ibfk_1\":\"employees\".\"employee\".\"id\" < \"employees\".\"department_employee\".\"employee_id\" [update: restrict, delete: cascade]\n\nRef \"dept_emp_ibfk_2\":\"employees\".\"department\".\"id\" < \"employees\".\"department_employee\".\"department_id\" [update: restrict, delete: cascade]\n\nRef \"dept_manager_ibfk_1\":\"employees\".\"employee\".\"id\" < \"employees\".\"department_manager\".\"employee_id\" [update: restrict, delete: cascade]\n\nRef \"dept_manager_ibfk_2\":\"employees\".\"department\".\"id\" < \"employees\".\"department_manager\".\"department_id\" [update: restrict, delete: cascade]\n\nRef \"salaries_ibfk_1\":\"employees\".\"employee\".\"id\" < \"employees\".\"salary\".\"employee_id\" [update: restrict, delete: cascade]\n\nRef \"titles_ibfk_1\":\"employees\".\"employee\".\"id\" < \"employees\".\"title\".\"employee_id\" [update: restrict, delete: cascade]\n",
"stateUrl": null,
"stateOriginalUrl": "https://github.com/neondatabase-labs/postgres-sample-dbs/blob/main/employees.sql.gz"
}
}
================================================
FILE: tasks/postgres/standard/employees/employee_project_tracking/verify.py
================================================
"""
Verification script for PostgreSQL Task 5: Database Schema and Data Operations
"""
import os
import sys
import psycopg2
from decimal import Decimal
def rows_match(actual_row, expected_row):
"""
Compare two rows with appropriate tolerance.
For Decimal types: allows 0.1 tolerance
For date types: convert to string for comparison
For other types: requires exact match
"""
if len(actual_row) != len(expected_row):
return False
for actual, expected in zip(actual_row, expected_row):
if isinstance(actual, Decimal) and isinstance(expected, (Decimal, float, int)):
if abs(float(actual) - float(expected)) > 0.1:
return False
elif hasattr(actual, 'strftime'): # datetime.date or datetime.datetime
if str(actual) != str(expected):
return False
elif actual != expected:
return False
return True
def get_connection_params() -> dict:
"""Get database connection parameters."""
return {
"host": os.getenv("POSTGRES_HOST", "localhost"),
"port": int(os.getenv("POSTGRES_PORT", 5432)),
"database": os.getenv("POSTGRES_DATABASE"),
"user": os.getenv("POSTGRES_USERNAME"),
"password": os.getenv("POSTGRES_PASSWORD")
}
def verify_table_structures(conn) -> bool:
"""Verify that all three tables were created with correct structure."""
with conn.cursor() as cur:
# Check if tables exist
cur.execute("""
SELECT table_name FROM information_schema.tables
WHERE table_schema = 'employees'
AND table_name IN ('employee_projects', 'project_assignments', 'project_milestones')
ORDER BY table_name
""")
tables = [row[0] for row in cur.fetchall()]
if len(tables) != 3:
print(f"❌ Expected 3 tables, found {len(tables)}: {tables}")
return False
# Check foreign key constraints exist
cur.execute("""
SELECT COUNT(*) FROM information_schema.table_constraints
WHERE table_schema = 'employees'
AND constraint_type = 'FOREIGN KEY'
AND table_name IN ('project_assignments', 'project_milestones')
""")
fkey_count = cur.fetchone()[0]
if fkey_count != 3:
print(f"❌ Expected 3 foreign key constraints, found {fkey_count}")
return False
# Check if priority column exists (added in step 6)
cur.execute("""
SELECT COUNT(*) FROM information_schema.columns
WHERE table_schema = 'employees' AND table_name = 'employee_projects'
AND column_name = 'priority'
""")
priority_exists = cur.fetchone()[0]
if priority_exists == 0:
print("❌ Priority column was not added to employee_projects table")
return False
print("✅ Table structures are correct")
return True
def verify_indexes(conn) -> bool:
"""Verify that required indexes were created."""
with conn.cursor() as cur:
# Check for specific indexes
cur.execute("""
SELECT COUNT(*)
FROM pg_indexes
WHERE schemaname = 'employees'
AND indexname IN ('idx_projects_status', 'idx_assignments_emp_proj', 'idx_milestones_due_date')
""")
index_count = cur.fetchone()[0]
if index_count != 3:
print(f"❌ Expected 3 required indexes, got {index_count}")
return False
print("✅ All required indexes are present")
return True
def verify_project_data(conn) -> bool:
"""Verify that project data was inserted and updated correctly."""
with conn.cursor() as cur:
# Check project data after updates
cur.execute("""
SELECT project_name, start_date, end_date, budget, status, priority
FROM employees.employee_projects
ORDER BY project_name
""")
projects = cur.fetchall()
if len(projects) != 3:
print(f"❌ Expected 3 projects, found {len(projects)}")
return False
# Expected final state after all updates
expected = {
'Database Modernization': ('2024-01-15', '2024-06-30', 287500.00, 'active', 'high'),
'Employee Portal Upgrade': ('2024-02-01', '2024-05-15', 207000.00, 'active', 'medium'),
'HR Analytics Dashboard': ('2023-11-01', '2024-01-31', 120000.00, 'completed', 'medium')
}
for project in projects:
name = project[0]
if name not in expected:
print(f"❌ Unexpected project: {name}")
return False
exp = expected[name]
# Use rows_match for comparison
expected_row = (name,) + exp
if not rows_match(project, expected_row):
print(f"❌ Project {name} data mismatch: expected {expected_row}, got {project}")
return False
print("✅ Project data is correct")
return True
def verify_assignment_data(conn) -> bool:
"""Verify that all current employees were assigned to projects by department."""
with conn.cursor() as cur:
# Check total assignment count matches current employee count
cur.execute("""
SELECT COUNT(*) FROM employees.project_assignments
""")
assignment_count = cur.fetchone()[0]
cur.execute("""
SELECT COUNT(DISTINCT de.employee_id)
FROM employees.department_employee de
WHERE de.to_date = '9999-01-01'
""")
current_employee_count = cur.fetchone()[0]
if assignment_count != current_employee_count:
print(f"❌ Expected {current_employee_count} assignments, found {assignment_count}")
return False
# Check department-project mapping
cur.execute("""
SELECT d.dept_name, pa.project_id, pa.role, pa.allocation_percentage, COUNT(*)
FROM employees.project_assignments pa
JOIN employees.department_employee de ON pa.employee_id = de.employee_id AND de.to_date = '9999-01-01'
JOIN employees.department d ON de.department_id = d.id
JOIN employees.employee_projects ep ON pa.project_id = ep.project_id
GROUP BY d.dept_name, pa.project_id, pa.role, pa.allocation_percentage
ORDER BY d.dept_name
""")
dept_assignments = cur.fetchall()
# Expected department-project mappings
expected_mappings = {
'Development': (1, 'Developer', 80),
'Human Resources': (2, 'Business Analyst', 60),
'Marketing': (3, 'Marketing Specialist', 40),
'Finance': (1, 'Financial Analyst', 30),
'Sales': (2, 'Sales Representative', 50),
'Research': (3, 'Research Analyst', 70),
'Production': (1, 'Production Coordinator', 45),
'Quality Management': (2, 'QA Specialist', 85),
'Customer Service': (3, 'Customer Success', 35)
}
dept_found = {}
for assignment in dept_assignments:
dept_name, project_id, role, allocation, _ = assignment # Ignore count
if dept_name in dept_found:
print(f"❌ Department {dept_name} has multiple assignments")
return False
dept_found[dept_name] = (project_id, role, allocation)
for dept, expected in expected_mappings.items():
if dept not in dept_found:
print(f"❌ Department {dept} has no assignments")
return False
if dept_found[dept] != expected:
print(f"❌ Department {dept} assignment mismatch: expected {expected}, got {dept_found[dept]}")
return False
# Check that all assignments have correct assigned_date
cur.execute("""
SELECT COUNT(*) FROM employees.project_assignments
WHERE assigned_date != '2024-01-01'
""")
wrong_date_count = cur.fetchone()[0]
if wrong_date_count > 0:
print(f"❌ {wrong_date_count} assignments have incorrect assigned_date")
return False
print("✅ Assignment data is correct")
return True
def verify_milestone_data(conn) -> bool:
"""Verify that milestone data was inserted and updated correctly."""
with conn.cursor() as cur:
cur.execute("""
SELECT project_id, milestone_name, due_date, completed
FROM employees.project_milestones
ORDER BY project_id, milestone_name
""")
milestones = cur.fetchall()
if len(milestones) != 6:
print(f"❌ Expected 6 milestones, found {len(milestones)}")
return False
# Expected milestones
expected_milestones = {
(1, 'Design Phase Complete'): ('2024-03-01', False),
(1, 'Implementation Complete'): ('2024-05-15', False),
(2, 'UI/UX Approval'): ('2024-03-15', False),
(2, 'Beta Testing'): ('2024-04-30', False),
(3, 'Data Collection'): ('2023-12-15', True), # Should be completed
(3, 'Dashboard Launch'): ('2024-01-25', False)
}
for milestone in milestones:
project_id, name, due_date, completed = milestone
key = (project_id, name)
if key not in expected_milestones:
print(f"❌ Unexpected milestone: {key}")
return False
expected_due, expected_completed = expected_milestones[key]
if str(due_date) != expected_due or completed != expected_completed:
print(f"❌ Milestone {name} mismatch: expected ({expected_due}, {expected_completed}), got ({due_date}, {completed})")
return False
print("✅ Milestone data is correct")
return True
def main():
"""Main verification function."""
print("=" * 50)
# Get connection parameters
conn_params = get_connection_params()
if not conn_params["database"]:
print("❌ No database specified")
sys.exit(1)
try:
# Connect to database
conn = psycopg2.connect(**conn_params)
# Verify all components
success = (
verify_table_structures(conn) and
verify_indexes(conn) and
verify_project_data(conn) and
verify_assignment_data(conn) and
verify_milestone_data(conn)
)
conn.close()
if success:
print("\n🎉 Task verification: PASS")
sys.exit(0)
else:
print("\n❌ Task verification: FAIL")
sys.exit(1)
except psycopg2.Error as e:
print(f"❌ Database error: {e}")
sys.exit(1)
except Exception as e:
print(f"❌ Verification error: {e}")
sys.exit(1)
if __name__ == "__main__":
main()
================================================
FILE: tasks/postgres/standard/employees/employee_retention_analysis/description.md
================================================
Analyze employee retention patterns and identify factors contributing to turnover across the organization. The HR leadership team needs comprehensive insights to develop targeted retention strategies and reduce costly employee attrition.
## Your Tasks:
1. **Create the retention analysis table** — build a table called `employee_retention_analysis` in the `employees` schema with these exact columns:
* `department_name` (varchar) — the department name
* `total_employees_ever` (integer) — total number of employees who have ever worked in this department
* `current_employees` (integer) — number of current employees in the department
* `former_employees` (integer) — number of employees who left the department
* `retention_rate` (decimal) — percentage of employees still with the company (current/total * 100)
2. **Create the high-risk employee identification table** — build a table called `high_risk_employees` in the `employees` schema with:
* `employee_id` (bigint) — the employee's ID
* `full_name` (varchar) — concatenated first and last name
* `current_department` (varchar) — current department name
* `tenure_days` (integer) — days with the company
* `current_salary` (integer) — current salary amount
* `risk_category` (varchar) — risk level ('high_risk', 'medium_risk', 'low_risk')
**Note**: Analyze only current employees (those with active salary records where to_date = '9999-01-01').
3. **Create the turnover trend analysis table** — build a table called `turnover_trend_analysis` in the `employees` schema with:
* `departure_year` (integer) — year when employees left (extract from to_date of salary records)
* `departures_count` (integer) — number of employees who left that year
* `avg_tenure_days` (decimal) — average tenure in days for employees who left that year
* `avg_final_salary` (decimal) — average final salary of departed employees that year
4. **Apply risk assessment criteria** for current employees:
* **High risk**: Employees in departments with retention rate < 80% AND tenure < 1095 days (3 years)
* **Medium risk**: Employees in departments with retention rate < 85% AND tenure < 1825 days (5 years)
* **Low risk**: All other current employees
5. **Analyze departure trends** — examine employees who left between 1985 and 2002, grouping by departure year.
6. **Handle final salary selection** — when calculating `avg_final_salary`, if an employee has multiple salary records with the same departure date, select the record with the latest start date. If there are still ties, select the record with the highest salary amount.
7. **Focus appropriately** — use current employees for risk analysis, all historical data for retention rates, and former employees for trend analysis.
The comprehensive analysis will help identify retention patterns, at-risk employees, and historical turnover trends to guide strategic workforce planning.
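
Step 6's tie-breaking rule is the subtlest part of the trend analysis. A minimal sketch, assuming only the `employees.salary` columns shown in the schema, is to rank each employee's non-current salary rows and keep the top-ranked one; the full query would additionally exclude employees who still have an active salary record:

```sql
-- Hedged sketch of the step 6 tie-break: latest departure date first,
-- then latest start date, then highest amount.
WITH last_salary AS (
    SELECT
        s.employee_id,
        s.to_date AS departure_date,
        s.amount  AS final_salary,
        ROW_NUMBER() OVER (
            PARTITION BY s.employee_id
            ORDER BY s.to_date DESC, s.from_date DESC, s.amount DESC
        ) AS rn
    FROM employees.salary s
    WHERE s.to_date <> DATE '9999-01-01'
)
SELECT employee_id, departure_date, final_salary
FROM last_salary
WHERE rn = 1;
```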
================================================
FILE: tasks/postgres/standard/employees/employee_retention_analysis/meta.json
================================================
{
"task_id": "employee_retention_analysis",
"task_name": "Employee Retention Analysis",
"category_id": "employees",
"category_name": "Employees",
"description": "Analyze retention patterns identifying turnover factors and high-risk employees to develop targeted retention strategies.",
"author": "Lingxiao Du",
"created_at": "2025-08-15",
"difficulty": "L3",
"tags": [
"reporting and analytics",
"statistical aggregation",
"audit and compliance"
],
"mcp": [
"postgres"
],
"meta_data": {
"stateType": "text",
"stateContent": "Enum \"employees\".\"employee_gender\" {\n \"M\"\n \"F\"\n}\n\nTable \"employees\".\"department\" {\n \"id\" bpchar(4) [pk, not null]\n \"dept_name\" varchar(40) [unique, not null]\n}\n\nTable \"employees\".\"department_employee\" {\n \"employee_id\" int8 [not null]\n \"department_id\" bpchar(4) [not null]\n \"from_date\" date [not null]\n \"to_date\" date [not null]\n\n Indexes {\n (employee_id, department_id) [type: btree, name: \"idx_16982_primary\"]\n department_id [type: btree, name: \"idx_16982_dept_no\"]\n }\n}\n\nTable \"employees\".\"department_manager\" {\n \"employee_id\" int8 [not null]\n \"department_id\" bpchar(4) [not null]\n \"from_date\" date [not null]\n \"to_date\" date [not null]\n\n Indexes {\n (employee_id, department_id) [type: btree, name: \"idx_16985_primary\"]\n department_id [type: btree, name: \"idx_16985_dept_no\"]\n }\n}\n\nTable \"employees\".\"employee\" {\n \"id\" int8 [pk, not null, increment]\n \"birth_date\" date [not null]\n \"first_name\" varchar(14) [not null]\n \"last_name\" varchar(16) [not null]\n \"gender\" employees.employee_gender [not null]\n \"hire_date\" date [not null]\n}\n\nTable \"employees\".\"salary\" {\n \"employee_id\" int8 [not null]\n \"amount\" int8 [not null]\n \"from_date\" date [not null]\n \"to_date\" date [not null]\n\n Indexes {\n (employee_id, from_date) [type: btree, name: \"idx_16991_primary\"]\n }\n}\n\nTable \"employees\".\"title\" {\n \"employee_id\" int8 [not null]\n \"title\" varchar(50) [not null]\n \"from_date\" date [not null]\n \"to_date\" date\n\n Indexes {\n (employee_id, title, from_date) [type: btree, name: \"idx_16994_primary\"]\n }\n}\n\nRef \"dept_emp_ibfk_1\":\"employees\".\"employee\".\"id\" < \"employees\".\"department_employee\".\"employee_id\" [update: restrict, delete: cascade]\n\nRef \"dept_emp_ibfk_2\":\"employees\".\"department\".\"id\" < \"employees\".\"department_employee\".\"department_id\" [update: restrict, delete: cascade]\n\nRef \"dept_manager_ibfk_1\":\"employees\".\"employee\".\"id\" < \"employees\".\"department_manager\".\"employee_id\" [update: restrict, delete: cascade]\n\nRef \"dept_manager_ibfk_2\":\"employees\".\"department\".\"id\" < \"employees\".\"department_manager\".\"department_id\" [update: restrict, delete: cascade]\n\nRef \"salaries_ibfk_1\":\"employees\".\"employee\".\"id\" < \"employees\".\"salary\".\"employee_id\" [update: restrict, delete: cascade]\n\nRef \"titles_ibfk_1\":\"employees\".\"employee\".\"id\" < \"employees\".\"title\".\"employee_id\" [update: restrict, delete: cascade]\n",
"stateUrl": null,
"stateOriginalUrl": "https://github.com/neondatabase-labs/postgres-sample-dbs/blob/main/employees.sql.gz"
}
}
================================================
FILE: tasks/postgres/standard/employees/employee_retention_analysis/verify.py
================================================
"""
Verification script for PostgreSQL Task 2: Employee Retention Analysis
"""
import os
import sys
import psycopg2
from decimal import Decimal
def rows_match(actual_row, expected_row):
"""
Compare two rows with appropriate tolerance.
For Decimal types: allows 0.1 tolerance
For other types: requires exact match
"""
if len(actual_row) != len(expected_row):
return False
for actual, expected in zip(actual_row, expected_row):
if isinstance(actual, Decimal) and isinstance(expected, Decimal):
if abs(float(actual) - float(expected)) > 0.1:
return False
elif actual != expected:
return False
return True
def get_connection_params() -> dict:
"""Get database connection parameters."""
return {
"host": os.getenv("POSTGRES_HOST", "localhost"),
"port": int(os.getenv("POSTGRES_PORT", 5432)),
"database": os.getenv("POSTGRES_DATABASE"),
"user": os.getenv("POSTGRES_USERNAME"),
"password": os.getenv("POSTGRES_PASSWORD")
}
def verify_retention_analysis_results(conn) -> bool:
"""Verify the employee retention analysis results."""
with conn.cursor() as cur:
# Get actual results from the created table
cur.execute("""
SELECT department_name, total_employees_ever, current_employees,
former_employees, retention_rate
FROM employees.employee_retention_analysis
ORDER BY department_name
""")
actual_results = cur.fetchall()
# Execute ground truth query
cur.execute("""
SELECT
d.dept_name AS department_name,
COUNT(DISTINCT de.employee_id) AS total_employees_ever,
COUNT(DISTINCT de.employee_id) FILTER (WHERE de.to_date = DATE '9999-01-01') AS current_employees,
(COUNT(DISTINCT de.employee_id)
- COUNT(DISTINCT de.employee_id) FILTER (WHERE de.to_date = DATE '9999-01-01')) AS former_employees,
(COUNT(DISTINCT de.employee_id) FILTER (WHERE de.to_date = DATE '9999-01-01'))::DECIMAL
/ NULLIF(COUNT(DISTINCT de.employee_id), 0) * 100 AS retention_rate
FROM employees.department d
LEFT JOIN employees.department_employee de
ON d.id = de.department_id
GROUP BY d.id, d.dept_name
ORDER BY d.dept_name
""")
expected_results = cur.fetchall()
if len(actual_results) != len(expected_results):
print(f"❌ Expected {len(expected_results)} retention analysis results, got {len(actual_results)}")
return False
mismatches = 0
for i, (actual, expected) in enumerate(zip(actual_results, expected_results)):
if not rows_match(actual, expected):
if mismatches < 5: # Only show first 5 mismatches
print(f"❌ Row {i+1} mismatch: expected {expected}, got {actual}")
mismatches += 1
if mismatches > 0:
print(f"❌ Total mismatches: {mismatches}")
return False
print(f"✅ Employee retention analysis results are correct ({len(actual_results)} records)")
return True
def verify_high_risk_results(conn) -> bool:
"""Verify the high risk employee analysis results."""
with conn.cursor() as cur:
# Get actual results from the created table
cur.execute("""
SELECT employee_id, full_name, current_department, tenure_days,
current_salary, risk_category
FROM employees.high_risk_employees
ORDER BY employee_id
""")
actual_results = cur.fetchall()
# Execute ground truth query - only current employees
cur.execute("""
WITH current_salary AS (
SELECT employee_id, amount AS current_amount
FROM (
SELECT s.*,
ROW_NUMBER() OVER (PARTITION BY s.employee_id
ORDER BY s.from_date DESC, s.amount DESC) AS rn
FROM employees.salary s
WHERE s.to_date = DATE '9999-01-01'
) x
WHERE rn = 1
),
current_dept AS (
SELECT employee_id, department_id
FROM (
SELECT de.*,
ROW_NUMBER() OVER (PARTITION BY de.employee_id
ORDER BY de.from_date DESC, de.department_id) AS rn
FROM employees.department_employee de
WHERE de.to_date = DATE '9999-01-01'
) x
WHERE rn = 1
),
dept_retention AS (
SELECT
d.id AS department_id,
d.dept_name,
COUNT(DISTINCT de.employee_id) AS total_employees_ever,
COUNT(DISTINCT de.employee_id) FILTER (WHERE de.to_date = DATE '9999-01-01') AS current_employees,
(COUNT(DISTINCT de.employee_id) FILTER (WHERE de.to_date = DATE '9999-01-01'))::NUMERIC
/ NULLIF(COUNT(DISTINCT de.employee_id), 0) * 100 AS retention_rate
FROM employees.department d
LEFT JOIN employees.department_employee de
ON de.department_id = d.id
GROUP BY d.id, d.dept_name
)
SELECT
e.id AS employee_id,
CONCAT(e.first_name, ' ', e.last_name) AS full_name,
d.dept_name AS current_department,
(CURRENT_DATE - e.hire_date)::INTEGER AS tenure_days,
cs.current_amount::INTEGER AS current_salary,
CASE
WHEN dr.retention_rate < 80 AND (CURRENT_DATE - e.hire_date) < 1095 THEN 'high_risk'
WHEN dr.retention_rate < 85 AND (CURRENT_DATE - e.hire_date) < 1825 THEN 'medium_risk'
ELSE 'low_risk'
END AS risk_category
FROM employees.employee e
JOIN current_salary cs ON cs.employee_id = e.id
JOIN current_dept cd ON cd.employee_id = e.id
JOIN employees.department d ON d.id = cd.department_id
JOIN dept_retention dr ON dr.department_id = d.id
ORDER BY e.id;
""")
expected_results = cur.fetchall()
if len(actual_results) != len(expected_results):
print(f"❌ Expected {len(expected_results)} high risk analysis results, got {len(actual_results)}")
return False
mismatches = 0
for i, (actual, expected) in enumerate(zip(actual_results, expected_results)):
if not rows_match(actual, expected):
if mismatches < 5: # Only show first 5 mismatches
print(f"❌ Row {i+1} mismatch: expected {expected}, got {actual}")
mismatches += 1
if mismatches > 0:
print(f"❌ Total mismatches: {mismatches}")
return False
print(f"✅ High risk employee analysis results are correct ({len(actual_results)} records)")
return True
def verify_turnover_trend_results(conn) -> bool:
"""Verify the turnover trend analysis results."""
with conn.cursor() as cur:
# Get actual results from the created table
cur.execute("""
SELECT departure_year, departures_count, avg_tenure_days, avg_final_salary
FROM employees.turnover_trend_analysis
ORDER BY departure_year
""")
actual_results = cur.fetchall()
# Execute ground truth query - simplified version
cur.execute("""
WITH last_non_current_salary AS (
SELECT
s.employee_id,
s.to_date AS departure_date,
s.amount AS final_salary,
ROW_NUMBER() OVER (
PARTITION BY s.employee_id
ORDER BY s.to_date DESC, s.from_date DESC, s.amount DESC
) AS rn
FROM employees.salary s
WHERE s.to_date <> DATE '9999-01-01'
AND NOT EXISTS (
SELECT 1
FROM employees.salary s_cur
WHERE s_cur.employee_id = s.employee_id
AND s_cur.to_date = DATE '9999-01-01'
)
),
departed AS (
SELECT employee_id, departure_date, final_salary
FROM last_non_current_salary
WHERE rn = 1
),
with_tenure AS (
SELECT
e.id AS employee_id,
d.departure_date,
d.final_salary,
(d.departure_date - e.hire_date)::INTEGER AS tenure_days
FROM employees.employee e
JOIN departed d ON d.employee_id = e.id
)
SELECT
EXTRACT(YEAR FROM departure_date)::INTEGER AS departure_year,
COUNT(*)::INTEGER AS departures_count,
AVG(tenure_days) AS avg_tenure_days,
AVG(final_salary) AS avg_final_salary
FROM with_tenure
WHERE departure_date BETWEEN DATE '1985-01-01' AND DATE '2002-12-31'
GROUP BY EXTRACT(YEAR FROM departure_date)
ORDER BY departure_year;
""")
expected_results = cur.fetchall()
if len(actual_results) != len(expected_results):
print(f"❌ Expected {len(expected_results)} turnover trend results, got {len(actual_results)}")
return False
mismatches = 0
for i, (actual, expected) in enumerate(zip(actual_results, expected_results)):
if not rows_match(actual, expected):
if mismatches < 5: # Only show first 5 mismatches
print(f"❌ Row {i+1} mismatch: expected {expected}, got {actual}")
mismatches += 1
if mismatches > 0:
print(f"❌ Total mismatches: {mismatches}")
return False
print(f"✅ Turnover trend analysis results are correct ({len(actual_results)} records)")
return True
def main():
"""Main verification function."""
print("=" * 50)
# Get connection parameters
conn_params = get_connection_params()
if not conn_params["database"]:
print("❌ No database specified")
sys.exit(1)
try:
# Connect to database
conn = psycopg2.connect(**conn_params)
# Verify all three analysis results
success = (
verify_retention_analysis_results(conn) and
verify_high_risk_results(conn) and
verify_turnover_trend_results(conn)
)
conn.close()
if success:
print("\n🎉 Task verification: PASS")
sys.exit(0)
else:
print("\n❌ Task verification: FAIL")
sys.exit(1)
except psycopg2.Error as e:
print(f"❌ Database error: {e}")
sys.exit(1)
except Exception as e:
print(f"❌ Verification error: {e}")
sys.exit(1)
if __name__ == "__main__":
main()
================================================
FILE: tasks/postgres/standard/employees/executive_dashboard_automation/description.md
================================================
Design a comprehensive reporting and automation system for executive dashboard and real-time monitoring. The executive team needs automated reports, data views, and trigger-based notifications to track key business metrics without manual intervention.
## Your Tasks:
1. **Create executive summary views** — build three materialized views in the `employees` schema:
**View 1: `exec_department_summary`**
* `department_name` (varchar) — department name
* `total_employees` (integer) — current active employee count
* `avg_salary` (decimal) — average current salary
* `total_payroll` (bigint) — total monthly payroll cost
* `manager_name` (varchar) — current department manager name
**View 2: `exec_hiring_trends`**
* `hire_year` (integer) — year employees were hired
* `employees_hired` (integer) — number hired that year
* `avg_starting_salary` (decimal) — average first salary of hires that year
* `retention_rate` (decimal) — percentage still employed
* `top_hiring_department` (varchar) — department that hired the most that year
**View 3: `exec_salary_distribution`**
* `salary_band` (varchar) — salary ranges ('30K-50K', '50K-70K', '70K-90K', '90K-110K', '110K+')
* `employee_count` (integer) — employees in this salary band
* `percentage_of_workforce` (decimal) — percentage of total workforce
* `most_common_title` (varchar) — most frequent job title in this band
2. **Create stored procedure for report generation**:
**Procedure: `generate_monthly_report(report_date DATE)`**
* Create a table `monthly_reports` with columns: report_id (auto-increment), report_date, department_count, total_employees (current active employees only), avg_salary, generated_at
* Insert one summary record using the report_date as identifier and current database statistics (not historical data for that date)
* Return the generated report_id
3. **Create notification triggers**:
**Trigger: `high_salary_alert`**
* Fires when a new salary record is inserted with amount > 120000
* Inserts alert into `salary_alerts` table with: employee_id, salary_amount, alert_date, status='new'
4. **Insert test data to verify triggers**:
* Update employee 10001's current salary: first set their current salary record to_date='2024-01-31', then insert new salary record with amount 125000, from_date='2024-02-01', to_date='9999-01-01'
* Refresh all materialized views after inserting new data to ensure they reflect the updated information
5. **Execute the stored procedure**:
* Call `generate_monthly_report('2024-01-01')` to create January report
* Query the generated report to verify execution
6. **Create performance indexes**:
* Index on `salary_alerts.status` for alert processing
* Composite index on `monthly_reports(report_date, department_count)` for trend analysis
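
The two pieces of automation in steps 2 and 3 can be sketched as follows. This is only an illustration: `generate_monthly_report` is written as a function because it must return the generated `report_id`, the parameter is renamed `p_report_date` purely to avoid ambiguity with the column of the same name, and the trigger's helper function name `notify_high_salary` is an arbitrary choice.

```sql
-- Hedged sketch of steps 2 and 3; names noted above as arbitrary are not
-- prescribed by the task.
CREATE TABLE IF NOT EXISTS employees.monthly_reports (
    report_id        bigserial PRIMARY KEY,
    report_date      date,
    department_count integer,
    total_employees  integer,
    avg_salary       numeric,
    generated_at     timestamptz DEFAULT now()
);

CREATE OR REPLACE FUNCTION employees.generate_monthly_report(p_report_date date)
RETURNS bigint AS $$
DECLARE
    v_report_id bigint;
BEGIN
    -- One simplified reading of "current database statistics":
    -- count current departments/employees and average current salaries.
    INSERT INTO employees.monthly_reports
        (report_date, department_count, total_employees, avg_salary)
    SELECT
        p_report_date,
        COUNT(DISTINCT de.department_id),
        COUNT(DISTINCT de.employee_id),
        AVG(s.amount)
    FROM employees.department_employee de
    JOIN employees.salary s
      ON s.employee_id = de.employee_id
     AND s.to_date = DATE '9999-01-01'
    WHERE de.to_date = DATE '9999-01-01'
    RETURNING report_id INTO v_report_id;

    RETURN v_report_id;
END;
$$ LANGUAGE plpgsql;

CREATE TABLE IF NOT EXISTS employees.salary_alerts (
    employee_id   bigint,
    salary_amount bigint,
    alert_date    date,
    status        varchar(20)
);

CREATE OR REPLACE FUNCTION employees.notify_high_salary()
RETURNS trigger AS $$
BEGIN
    IF NEW.amount > 120000 THEN
        INSERT INTO employees.salary_alerts (employee_id, salary_amount, alert_date, status)
        VALUES (NEW.employee_id, NEW.amount, CURRENT_DATE, 'new');
    END IF;
    RETURN NEW;
END;
$$ LANGUAGE plpgsql;

CREATE TRIGGER high_salary_alert
    AFTER INSERT ON employees.salary
    FOR EACH ROW
    EXECUTE FUNCTION employees.notify_high_salary();
```

Whether the report table is created inside the routine or ahead of time is left open by the task; the sketch creates it up front so the function body stays a single insert.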
================================================
FILE: tasks/postgres/standard/employees/executive_dashboard_automation/meta.json
================================================
{
"task_id": "executive_dashboard_automation",
"task_name": "Executive Dashboard Automation",
"category_id": "employees",
"category_name": "Employees",
"description": "Design automated reporting system with materialized views, stored procedures, and triggers for executive dashboard monitoring.",
"author": "Lingxiao Du",
"created_at": "2025-08-15",
"difficulty": "L3",
"tags": [
"reporting and analytics",
"stored procedures and functions",
"schema design"
],
"mcp": [
"postgres"
],
"meta_data": {
"stateType": "text",
"stateContent": "Enum \"employees\".\"employee_gender\" {\n \"M\"\n \"F\"\n}\n\nTable \"employees\".\"department\" {\n \"id\" bpchar(4) [pk, not null]\n \"dept_name\" varchar(40) [unique, not null]\n}\n\nTable \"employees\".\"department_employee\" {\n \"employee_id\" int8 [not null]\n \"department_id\" bpchar(4) [not null]\n \"from_date\" date [not null]\n \"to_date\" date [not null]\n\n Indexes {\n (employee_id, department_id) [type: btree, name: \"idx_16982_primary\"]\n department_id [type: btree, name: \"idx_16982_dept_no\"]\n }\n}\n\nTable \"employees\".\"department_manager\" {\n \"employee_id\" int8 [not null]\n \"department_id\" bpchar(4) [not null]\n \"from_date\" date [not null]\n \"to_date\" date [not null]\n\n Indexes {\n (employee_id, department_id) [type: btree, name: \"idx_16985_primary\"]\n department_id [type: btree, name: \"idx_16985_dept_no\"]\n }\n}\n\nTable \"employees\".\"employee\" {\n \"id\" int8 [pk, not null, increment]\n \"birth_date\" date [not null]\n \"first_name\" varchar(14) [not null]\n \"last_name\" varchar(16) [not null]\n \"gender\" employees.employee_gender [not null]\n \"hire_date\" date [not null]\n}\n\nTable \"employees\".\"salary\" {\n \"employee_id\" int8 [not null]\n \"amount\" int8 [not null]\n \"from_date\" date [not null]\n \"to_date\" date [not null]\n\n Indexes {\n (employee_id, from_date) [type: btree, name: \"idx_16991_primary\"]\n }\n}\n\nTable \"employees\".\"title\" {\n \"employee_id\" int8 [not null]\n \"title\" varchar(50) [not null]\n \"from_date\" date [not null]\n \"to_date\" date\n\n Indexes {\n (employee_id, title, from_date) [type: btree, name: \"idx_16994_primary\"]\n }\n}\n\nRef \"dept_emp_ibfk_1\":\"employees\".\"employee\".\"id\" < \"employees\".\"department_employee\".\"employee_id\" [update: restrict, delete: cascade]\n\nRef \"dept_emp_ibfk_2\":\"employees\".\"department\".\"id\" < \"employees\".\"department_employee\".\"department_id\" [update: restrict, delete: cascade]\n\nRef \"dept_manager_ibfk_1\":\"employees\".\"employee\".\"id\" < \"employees\".\"department_manager\".\"employee_id\" [update: restrict, delete: cascade]\n\nRef \"dept_manager_ibfk_2\":\"employees\".\"department\".\"id\" < \"employees\".\"department_manager\".\"department_id\" [update: restrict, delete: cascade]\n\nRef \"salaries_ibfk_1\":\"employees\".\"employee\".\"id\" < \"employees\".\"salary\".\"employee_id\" [update: restrict, delete: cascade]\n\nRef \"titles_ibfk_1\":\"employees\".\"employee\".\"id\" < \"employees\".\"title\".\"employee_id\" [update: restrict, delete: cascade]\n",
"stateUrl": null,
"stateOriginalUrl": "https://github.com/neondatabase-labs/postgres-sample-dbs/blob/main/employees.sql.gz"
}
}
================================================
FILE: tasks/postgres/standard/employees/executive_dashboard_automation/verify.py
================================================
"""
Verification script for PostgreSQL Task 6: Reporting and Automation System
"""
import os
import sys
import psycopg2
from decimal import Decimal
def rows_match(actual_row, expected_row):
"""
Compare two rows with appropriate tolerance.
For Decimal types: allows 0.1 tolerance
For date types: convert to string for comparison
For other types: requires exact match
"""
if len(actual_row) != len(expected_row):
return False
for actual, expected in zip(actual_row, expected_row):
if isinstance(actual, Decimal) and isinstance(expected, (Decimal, float, int)):
if abs(float(actual) - float(expected)) > 0.1:
return False
elif hasattr(actual, 'strftime'): # datetime.date or datetime.datetime
if str(actual) != str(expected):
return False
elif actual != expected:
return False
return True
def get_connection_params() -> dict:
"""Get database connection parameters."""
return {
"host": os.getenv("POSTGRES_HOST", "localhost"),
"port": int(os.getenv("POSTGRES_PORT", 5432)),
"database": os.getenv("POSTGRES_DATABASE"),
"user": os.getenv("POSTGRES_USERNAME"),
"password": os.getenv("POSTGRES_PASSWORD")
}
def verify_materialized_views(conn) -> bool:
"""Verify that materialized views were created and populated correctly."""
with conn.cursor() as cur:
# Check if materialized views exist
cur.execute("""
SELECT matviewname FROM pg_matviews
WHERE schemaname = 'employees'
AND matviewname IN ('exec_department_summary', 'exec_hiring_trends', 'exec_salary_distribution')
ORDER BY matviewname
""")
views = [row[0] for row in cur.fetchall()]
expected_views = ['exec_department_summary', 'exec_hiring_trends', 'exec_salary_distribution']
if set(views) != set(expected_views):
print(f"❌ Expected views {expected_views}, found {views}")
return False
# Check all departments' data accuracy
cur.execute("""
SELECT department_name, total_employees, avg_salary, total_payroll, manager_name
FROM employees.exec_department_summary
ORDER BY department_name
""")
view_data = cur.fetchall()
# Get actual data for all departments
cur.execute("""
WITH current_salary AS (
SELECT employee_id, amount
FROM (
SELECT s.*,
ROW_NUMBER() OVER (
PARTITION BY s.employee_id
ORDER BY s.from_date DESC, s.amount DESC
) AS rn
FROM employees.salary s
WHERE s.to_date = DATE '9999-01-01'
) x
WHERE rn = 1
),
current_dept AS (
SELECT DISTINCT de.employee_id, de.department_id
FROM employees.department_employee de
WHERE de.to_date = DATE '9999-01-01'
),
current_manager AS (
SELECT department_id,
CONCAT(e.first_name, ' ', e.last_name) AS manager_name
FROM (
SELECT dm.*,
ROW_NUMBER() OVER (
PARTITION BY dm.department_id
ORDER BY dm.from_date DESC, dm.employee_id
) AS rn
FROM employees.department_manager dm
WHERE dm.to_date = DATE '9999-01-01'
) dm
JOIN employees.employee e ON e.id = dm.employee_id
WHERE dm.rn = 1
)
SELECT
d.dept_name AS department_name,
COUNT(cd.employee_id)::INT AS total_employees,
AVG(cs.amount)::DECIMAL AS avg_salary,
COALESCE(SUM(cs.amount), 0)::BIGINT AS total_payroll,
cm.manager_name
FROM employees.department d
LEFT JOIN current_dept cd ON cd.department_id = d.id
LEFT JOIN current_salary cs ON cs.employee_id = cd.employee_id
LEFT JOIN current_manager cm ON cm.department_id = d.id
GROUP BY d.id, d.dept_name, cm.manager_name
ORDER BY d.dept_name;
""")
actual_data = cur.fetchall()
if len(view_data) != len(actual_data):
print(f"❌ Department count mismatch: view={len(view_data)}, actual={len(actual_data)}")
return False
for view_row, actual_row in zip(view_data, actual_data):
if not rows_match(view_row, actual_row):
print(f"❌ Department summary data incorrect for {view_row[0]}: view={view_row}, actual={actual_row}")
return False
# Check all hiring trends data accuracy
cur.execute("""
SELECT hire_year, employees_hired, avg_starting_salary, retention_rate, top_hiring_department
FROM employees.exec_hiring_trends
ORDER BY hire_year
""")
hiring_view_data = cur.fetchall()
# Get actual data for all years
cur.execute("""
WITH first_salary AS (
SELECT employee_id, amount AS starting_salary
FROM (
SELECT s.*,
ROW_NUMBER() OVER (
PARTITION BY s.employee_id
ORDER BY s.from_date ASC, s.amount ASC
) AS rn
FROM employees.salary s
) x
WHERE rn = 1
),
current_emp AS (
SELECT DISTINCT s.employee_id
FROM employees.salary s
WHERE s.to_date = DATE '9999-01-01'
),
first_dept AS (
SELECT employee_id, department_id
FROM (
SELECT de.*,
ROW_NUMBER() OVER (
PARTITION BY de.employee_id
ORDER BY de.from_date ASC, de.department_id
) AS rn
FROM employees.department_employee de
) x
WHERE rn = 1
),
hire_base AS (
SELECT e.id AS employee_id,
EXTRACT(YEAR FROM e.hire_date)::INT AS hire_year
FROM employees.employee e
WHERE e.hire_date IS NOT NULL
),
hire_by_dept_year AS (
SELECT hb.hire_year,
d.dept_name,
COUNT(*) AS dept_hires
FROM hire_base hb
LEFT JOIN first_dept fd ON fd.employee_id = hb.employee_id
LEFT JOIN employees.department d ON d.id = fd.department_id
GROUP BY hb.hire_year, d.dept_name
),
top_dept_per_year AS (
SELECT hire_year,
dept_name AS top_hiring_department
FROM (
SELECT hire_year, dept_name, dept_hires,
ROW_NUMBER() OVER (
PARTITION BY hire_year
ORDER BY dept_hires DESC NULLS LAST, dept_name
) AS rn
FROM hire_by_dept_year
) t
WHERE rn = 1
)
SELECT
hb.hire_year,
COUNT(*)::INT AS employees_hired,
AVG(fs.starting_salary)::DECIMAL AS avg_starting_salary,
(COUNT(ce.employee_id)::DECIMAL / NULLIF(COUNT(*), 0) * 100) AS retention_rate,
td.top_hiring_department
FROM hire_base hb
LEFT JOIN first_salary fs ON fs.employee_id = hb.employee_id
LEFT JOIN current_emp ce ON ce.employee_id = hb.employee_id
LEFT JOIN top_dept_per_year td ON td.hire_year = hb.hire_year
GROUP BY hb.hire_year, td.top_hiring_department
ORDER BY hb.hire_year;
""")
actual_hiring_data = cur.fetchall()
if len(hiring_view_data) != len(actual_hiring_data):
print(f"❌ Hiring trends count mismatch: view={len(hiring_view_data)}, actual={len(actual_hiring_data)}")
return False
for hiring_view, actual_hiring in zip(hiring_view_data, actual_hiring_data):
# Now compare all 5 fields including top_hiring_department
if not rows_match(hiring_view, actual_hiring):
print(f"❌ Hiring trends data incorrect for year {hiring_view[0]}: view={hiring_view}, actual={actual_hiring}")
return False
# Check all salary bands' data accuracy
cur.execute("""
WITH band_order AS (
SELECT '30K-50K' AS band, 1 AS ord UNION ALL
SELECT '50K-70K', 2 UNION ALL
SELECT '70K-90K', 3 UNION ALL
SELECT '90K-110K',4 UNION ALL
SELECT '110K+', 5
)
SELECT salary_band, employee_count, percentage_of_workforce, most_common_title
FROM employees.exec_salary_distribution v
JOIN band_order bo ON bo.band = v.salary_band
ORDER BY bo.ord;
""")
view_bands = cur.fetchall()
# Calculate actual data for all bands
cur.execute("""
WITH current_salary AS (
SELECT employee_id, amount
FROM (
SELECT s.*,
ROW_NUMBER() OVER (
PARTITION BY s.employee_id
ORDER BY s.from_date DESC, s.amount DESC
) AS rn
FROM employees.salary s
WHERE s.to_date = DATE '9999-01-01'
) x
WHERE rn = 1
),
current_title AS (
SELECT employee_id, title
FROM (
SELECT t.*,
ROW_NUMBER() OVER (
PARTITION BY t.employee_id
ORDER BY t.from_date DESC, t.title
) AS rn
FROM employees.title t
WHERE t.to_date = DATE '9999-01-01'
) x
WHERE rn = 1
),
base AS (
SELECT cs.employee_id, cs.amount, COALESCE(ct.title, 'Unknown') AS title
FROM current_salary cs
LEFT JOIN current_title ct ON ct.employee_id = cs.employee_id
),
banded AS (
SELECT
CASE
WHEN amount < 50000 THEN '30K-50K'
WHEN amount < 70000 THEN '50K-70K'
WHEN amount < 90000 THEN '70K-90K'
WHEN amount < 110000 THEN '90K-110K'
ELSE '110K+'
END AS salary_band,
title,
employee_id
FROM base
),
band_counts AS (
SELECT salary_band, COUNT(DISTINCT employee_id) AS employee_count
FROM banded
GROUP BY salary_band
),
title_counts AS (
SELECT salary_band, title, COUNT(DISTINCT employee_id) AS title_count
FROM banded
GROUP BY salary_band, title
),
top_titles AS (
SELECT salary_band, title AS most_common_title
FROM (
SELECT salary_band, title, title_count,
ROW_NUMBER() OVER (
PARTITION BY salary_band
ORDER BY title_count DESC, title
) AS rn
FROM title_counts
) t
WHERE rn = 1
),
workforce AS (
SELECT COUNT(DISTINCT employee_id) AS total_current
FROM base
),
band_order AS (
SELECT '30K-50K' AS band, 1 AS ord UNION ALL
SELECT '50K-70K', 2 UNION ALL
SELECT '70K-90K', 3 UNION ALL
SELECT '90K-110K', 4 UNION ALL
SELECT '110K+', 5
)
SELECT
bc.salary_band,
bc.employee_count::INT AS employee_count,
(bc.employee_count::DECIMAL / NULLIF((SELECT total_current FROM workforce), 0) * 100) AS percentage_of_workforce,
tt.most_common_title
FROM band_counts bc
LEFT JOIN top_titles tt ON tt.salary_band = bc.salary_band
LEFT JOIN band_order bo ON bo.band = bc.salary_band
ORDER BY bo.ord;
""")
actual_bands = cur.fetchall()
# Compare view data with actual data
if len(view_bands) != len(actual_bands):
print(f"❌ Salary band count mismatch: view={len(view_bands)}, actual={len(actual_bands)}")
return False
for view_band, actual_band in zip(view_bands, actual_bands):
if not rows_match(view_band, actual_band):
print(f"❌ Salary band {actual_band[0]} data incorrect: view={view_band}, actual={actual_band}")
return False
print("✅ All materialized views are created and contain correct data")
return True
def verify_stored_procedures(conn) -> bool:
"""Verify that stored procedure was created."""
with conn.cursor() as cur:
# Check if procedure exists
cur.execute("""
SELECT routine_name FROM information_schema.routines
WHERE routine_schema = 'employees'
AND routine_type = 'FUNCTION'
AND routine_name = 'generate_monthly_report'
""")
procedures = [row[0] for row in cur.fetchall()]
if 'generate_monthly_report' not in procedures:
print("❌ generate_monthly_report procedure not found")
return False
# Check if monthly_reports table exists with correct structure
cur.execute("""
SELECT COUNT(*) FROM information_schema.columns
WHERE table_schema = 'employees' AND table_name = 'monthly_reports'
AND column_name IN ('report_id', 'report_date', 'department_count', 'total_employees', 'avg_salary', 'generated_at')
""")
report_columns = cur.fetchone()[0]
if report_columns != 6:
print("❌ monthly_reports table missing required columns")
return False
print("✅ Stored procedure and supporting table are created")
return True
def verify_triggers(conn) -> bool:
"""Verify that triggers were created and fired correctly."""
with conn.cursor() as cur:
# Check if triggers exist
cur.execute("""
SELECT trigger_name FROM information_schema.triggers
WHERE trigger_schema = 'employees'
AND trigger_name = 'high_salary_alert'
""")
triggers = [row[0] for row in cur.fetchall()]
if 'high_salary_alert' not in triggers:
print("❌ high_salary_alert trigger not found")
return False
# Check if trigger support table exists
cur.execute("""
SELECT table_name FROM information_schema.tables
WHERE table_schema = 'employees'
AND table_name = 'salary_alerts'
""")
trigger_tables = [row[0] for row in cur.fetchall()]
if 'salary_alerts' not in trigger_tables:
print("❌ salary_alerts table not found")
return False
# Check if the old salary record was properly closed
cur.execute("""
SELECT COUNT(*) FROM employees.salary
WHERE employee_id = 10001 AND to_date = '2024-01-31'
""")
old_salary_count = cur.fetchone()[0]
if old_salary_count == 0:
print("❌ Old salary record for employee 10001 was not properly closed with to_date='2024-01-31'")
return False
# Check if the new salary record was inserted
cur.execute("""
SELECT COUNT(*) FROM employees.salary
WHERE employee_id = 10001 AND amount = 125000
AND from_date = '2024-02-01' AND to_date = '9999-01-01'
""")
new_salary_count = cur.fetchone()[0]
if new_salary_count == 0:
print("❌ New salary record for employee 10001 with amount 125000 was not inserted")
return False
# Check if high salary alert was triggered with specific details
cur.execute("""
SELECT COUNT(*) FROM employees.salary_alerts
WHERE employee_id = 10001 AND salary_amount = 125000 AND status = 'new'
""")
alert_count = cur.fetchone()[0]
if alert_count == 0:
print("❌ High salary alert was not triggered correctly for employee 10001 with amount 125000")
return False
print("✅ Trigger is created and functioning correctly")
return True
def verify_procedure_execution(conn) -> bool:
"""Verify that stored procedure was executed with correct data."""
with conn.cursor() as cur:
# Check if monthly report data matches actual statistics
cur.execute("""
SELECT department_count, total_employees, avg_salary
FROM employees.monthly_reports
WHERE report_date = '2024-01-01'
""")
report_data = cur.fetchone()
if not report_data:
print("❌ Monthly report for 2024-01-01 was not generated")
return False
# Get actual current statistics to compare
cur.execute("""
WITH current_salary AS (
SELECT employee_id, amount
FROM (
SELECT s.*,
ROW_NUMBER() OVER (
PARTITION BY s.employee_id
ORDER BY s.from_date DESC, s.amount DESC
) AS rn
FROM employees.salary s
WHERE s.to_date = DATE '9999-01-01'
) x
WHERE rn = 1
),
current_dept AS (
SELECT DISTINCT de.employee_id, de.department_id
FROM employees.department_employee de
WHERE de.to_date = DATE '9999-01-01'
),
base AS (
SELECT cd.department_id, cs.employee_id, cs.amount
FROM current_dept cd
JOIN current_salary cs ON cs.employee_id = cd.employee_id
)
SELECT
COUNT(DISTINCT department_id) AS actual_dept_count,
COUNT(DISTINCT employee_id) AS actual_total_employees,
AVG(amount)::DECIMAL AS actual_avg_salary
FROM base;
""")
actual_stats = cur.fetchone()
# Compare report data with actual data
if not rows_match(report_data, actual_stats):
print(f"❌ Monthly report data incorrect: expected {actual_stats}, got {report_data}")
return False
print("✅ Stored procedure executed with correct data")
return True
def verify_indexes(conn) -> bool:
"""Verify that performance indexes were created."""
with conn.cursor() as cur:
# Check for required indexes
cur.execute("""
SELECT indexname FROM pg_indexes
WHERE schemaname = 'employees'
AND tablename IN ('salary_alerts', 'monthly_reports')
AND indexname LIKE 'idx_%'
ORDER BY indexname
""")
indexes = [row[0] for row in cur.fetchall()]
# Should have at least 2 indexes created
if len(indexes) < 2:
print(f"❌ Expected at least 2 performance indexes, found {len(indexes)}")
return False
print("✅ Performance indexes are created")
return True
def main():
"""Main verification function."""
print("=" * 50)
# Get connection parameters
conn_params = get_connection_params()
if not conn_params["database"]:
print("❌ No database specified")
sys.exit(1)
try:
# Connect to database
conn = psycopg2.connect(**conn_params)
# Verify all components
success = (
verify_materialized_views(conn) and
verify_stored_procedures(conn) and
verify_triggers(conn) and
verify_procedure_execution(conn) and
verify_indexes(conn)
)
conn.close()
if success:
print("\n🎉 Task verification: PASS")
sys.exit(0)
else:
print("\n❌ Task verification: FAIL")
sys.exit(1)
except psycopg2.Error as e:
print(f"❌ Database error: {e}")
sys.exit(1)
except Exception as e:
print(f"❌ Verification error: {e}")
sys.exit(1)
if __name__ == "__main__":
main()
================================================
FILE: tasks/postgres/standard/employees/management_structure_analysis/description.md
================================================
Conduct a comprehensive management structure analysis to evaluate leadership effectiveness and organizational hierarchy. The executive team needs insights into management tenure, span of control, and leadership transitions to optimize the management structure and succession planning.
## Your Tasks:
1. **Create the manager profile table** — build a table called `manager_profile` in the `employees` schema with these exact columns:
* `manager_id` (bigint) — the manager's employee ID
* `manager_name` (varchar) — concatenated first and last name
* `current_department` (varchar) — current department they manage (NULL if not current)
* `management_periods` (integer) — total number of management assignments (including multiple periods in same department)
* `current_manager` (boolean) — whether they are currently a manager
2. **Create the department leadership table** — build a table called `department_leadership` in the `employees` schema with:
* `department_name` (varchar) — the department name
* `current_manager_name` (varchar) — current manager's full name
* `manager_start_date` (date) — when current manager started
* `total_historical_managers` (integer) — total number of managers this department has had
3. **Create the management transition table** — build a table called `management_transitions` in the `employees` schema with:
* `department_name` (varchar) — the department name
* `transition_year` (integer) — year when management changed
* `outgoing_manager` (varchar) — previous manager's name
* `incoming_manager` (varchar) — new manager's name ('No Successor' if department had no immediate replacement)
* `transition_gap_days` (integer) — days between managers (0 if immediate or no successor)
4. **Create the span of control table** — build a table called `span_of_control` in the `employees` schema with:
* `manager_id` (bigint) — the manager's employee ID
* `manager_name` (varchar) — manager's full name
* `department_name` (varchar) — department they manage
* `total_employees` (integer) — total employees in their department
* `current_employees` (integer) — current active employees in department
* `management_load` (varchar) — assessment ('light', 'moderate', 'heavy') based on current employees
5. **Apply management load classification**:
* **Light**: < 5,000 current employees
* **Moderate**: 5,000 - 15,000 current employees
* **Heavy**: > 15,000 current employees
6. **Focus on current managers only** for span of control analysis — use managers with active management roles (to_date = '9999-01-01').
7. **Track all management history** for profiles and transitions — include both current and former managers to understand complete leadership evolution.
The analysis will provide insights into management effectiveness, departmental stability, and organizational structure optimization opportunities.
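
As a hedged illustration of the step 5 thresholds, the classification can be expressed as a single CASE over a per-department count of current employees; whether "current" also requires an active salary record is left to the task's definition, so the count below is deliberately simplified:

```sql
-- Illustrative classification only; counts current department assignments.
SELECT
    de.department_id,
    COUNT(DISTINCT de.employee_id) AS current_employees,
    CASE
        WHEN COUNT(DISTINCT de.employee_id) < 5000   THEN 'light'
        WHEN COUNT(DISTINCT de.employee_id) <= 15000 THEN 'moderate'
        ELSE 'heavy'
    END AS management_load
FROM employees.department_employee de
WHERE de.to_date = DATE '9999-01-01'
GROUP BY de.department_id;
```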
================================================
FILE: tasks/postgres/standard/employees/management_structure_analysis/meta.json
================================================
{
"task_id": "management_structure_analysis",
"task_name": "Management Structure Analysis",
"category_id": "employees",
"category_name": "Employees",
"description": "Analyze management structure evaluating leadership effectiveness, span of control, and management transitions for succession planning.",
"author": "Lingxiao Du",
"created_at": "2025-08-15",
"difficulty": "L3",
"tags": [
"reporting and analytics",
"statistical aggregation"
],
"mcp": [
"postgres"
],
"meta_data": {
"stateType": "text",
"stateContent": "Enum \"employees\".\"employee_gender\" {\n \"M\"\n \"F\"\n}\n\nTable \"employees\".\"department\" {\n \"id\" bpchar(4) [pk, not null]\n \"dept_name\" varchar(40) [unique, not null]\n}\n\nTable \"employees\".\"department_employee\" {\n \"employee_id\" int8 [not null]\n \"department_id\" bpchar(4) [not null]\n \"from_date\" date [not null]\n \"to_date\" date [not null]\n\n Indexes {\n (employee_id, department_id) [type: btree, name: \"idx_16982_primary\"]\n department_id [type: btree, name: \"idx_16982_dept_no\"]\n }\n}\n\nTable \"employees\".\"department_manager\" {\n \"employee_id\" int8 [not null]\n \"department_id\" bpchar(4) [not null]\n \"from_date\" date [not null]\n \"to_date\" date [not null]\n\n Indexes {\n (employee_id, department_id) [type: btree, name: \"idx_16985_primary\"]\n department_id [type: btree, name: \"idx_16985_dept_no\"]\n }\n}\n\nTable \"employees\".\"employee\" {\n \"id\" int8 [pk, not null, increment]\n \"birth_date\" date [not null]\n \"first_name\" varchar(14) [not null]\n \"last_name\" varchar(16) [not null]\n \"gender\" employees.employee_gender [not null]\n \"hire_date\" date [not null]\n}\n\nTable \"employees\".\"salary\" {\n \"employee_id\" int8 [not null]\n \"amount\" int8 [not null]\n \"from_date\" date [not null]\n \"to_date\" date [not null]\n\n Indexes {\n (employee_id, from_date) [type: btree, name: \"idx_16991_primary\"]\n }\n}\n\nTable \"employees\".\"title\" {\n \"employee_id\" int8 [not null]\n \"title\" varchar(50) [not null]\n \"from_date\" date [not null]\n \"to_date\" date\n\n Indexes {\n (employee_id, title, from_date) [type: btree, name: \"idx_16994_primary\"]\n }\n}\n\nRef \"dept_emp_ibfk_1\":\"employees\".\"employee\".\"id\" < \"employees\".\"department_employee\".\"employee_id\" [update: restrict, delete: cascade]\n\nRef \"dept_emp_ibfk_2\":\"employees\".\"department\".\"id\" < \"employees\".\"department_employee\".\"department_id\" [update: restrict, delete: cascade]\n\nRef \"dept_manager_ibfk_1\":\"employees\".\"employee\".\"id\" < \"employees\".\"department_manager\".\"employee_id\" [update: restrict, delete: cascade]\n\nRef \"dept_manager_ibfk_2\":\"employees\".\"department\".\"id\" < \"employees\".\"department_manager\".\"department_id\" [update: restrict, delete: cascade]\n\nRef \"salaries_ibfk_1\":\"employees\".\"employee\".\"id\" < \"employees\".\"salary\".\"employee_id\" [update: restrict, delete: cascade]\n\nRef \"titles_ibfk_1\":\"employees\".\"employee\".\"id\" < \"employees\".\"title\".\"employee_id\" [update: restrict, delete: cascade]\n",
"stateUrl": null,
"stateOriginalUrl": "https://github.com/neondatabase-labs/postgres-sample-dbs/blob/main/employees.sql.gz"
}
}
================================================
FILE: tasks/postgres/standard/employees/management_structure_analysis/verify.py
================================================
"""
Verification script for PostgreSQL Task 4: Management Structure Analysis
"""
import os
import sys
import psycopg2
from decimal import Decimal
def rows_match(actual_row, expected_row):
"""
Compare two rows with appropriate tolerance.
For Decimal types: allows 0.1 tolerance
For other types: requires exact match
"""
if len(actual_row) != len(expected_row):
return False
for actual, expected in zip(actual_row, expected_row):
if isinstance(actual, Decimal) and isinstance(expected, Decimal):
if abs(float(actual) - float(expected)) > 0.1:
return False
elif actual != expected:
return False
return True
def get_connection_params() -> dict:
"""Get database connection parameters."""
return {
"host": os.getenv("POSTGRES_HOST", "localhost"),
"port": int(os.getenv("POSTGRES_PORT", 5432)),
"database": os.getenv("POSTGRES_DATABASE"),
"user": os.getenv("POSTGRES_USERNAME"),
"password": os.getenv("POSTGRES_PASSWORD")
}
def verify_manager_profile_results(conn) -> bool:
"""Verify the manager profile results."""
with conn.cursor() as cur:
# Get actual results from the created table
cur.execute("""
SELECT manager_id, manager_name, current_department,
management_periods, current_manager
FROM employees.manager_profile
ORDER BY manager_id
""")
actual_results = cur.fetchall()
# Execute ground truth query
cur.execute("""
WITH dm AS (
SELECT dm.employee_id,
dm.department_id,
dm.from_date,
dm.to_date
FROM employees.department_manager dm
),
manager_periods AS (
SELECT employee_id, COUNT(*)::INT AS management_periods
FROM dm
GROUP BY employee_id
),
current_assignment AS (
SELECT employee_id, department_id
FROM (
SELECT d.*,
ROW_NUMBER() OVER (
PARTITION BY d.employee_id
ORDER BY d.from_date DESC, d.department_id
) AS rn
FROM dm d
WHERE d.to_date = DATE '9999-01-01'
) x
WHERE rn = 1
),
manager_names AS (
SELECT e.id AS manager_id,
CONCAT(e.first_name, ' ', e.last_name) AS manager_name
FROM employees.employee e
WHERE EXISTS (SELECT 1 FROM dm WHERE employee_id = e.id)
)
SELECT
mn.manager_id,
mn.manager_name,
d.dept_name AS current_department,
mp.management_periods,
(d.dept_name IS NOT NULL) AS current_manager
FROM manager_names mn
JOIN manager_periods mp ON mp.employee_id = mn.manager_id
LEFT JOIN current_assignment ca ON ca.employee_id = mn.manager_id
LEFT JOIN employees.department d ON d.id = ca.department_id
ORDER BY mn.manager_id;
""")
expected_results = cur.fetchall()
if len(actual_results) != len(expected_results):
print(f"❌ Expected {len(expected_results)} manager profile results, got {len(actual_results)}")
return False
mismatches = 0
for i, (actual, expected) in enumerate(zip(actual_results, expected_results)):
if not rows_match(actual, expected):
if mismatches < 5: # Only show first 5 mismatches
print(f"❌ Row {i+1} mismatch: expected {expected}, got {actual}")
mismatches += 1
if mismatches > 0:
print(f"❌ Total mismatches: {mismatches}")
return False
print(f"✅ Manager profile results are correct ({len(actual_results)} records)")
return True
def verify_department_leadership_results(conn) -> bool:
"""Verify the department leadership results."""
with conn.cursor() as cur:
# Get actual results from the created table
cur.execute("""
SELECT department_name, current_manager_name, manager_start_date,
total_historical_managers
FROM employees.department_leadership
ORDER BY department_name
""")
actual_results = cur.fetchall()
# Execute ground truth query
cur.execute("""
WITH current_mgr AS (
SELECT department_id,
CONCAT(e.first_name, ' ', e.last_name) AS current_manager_name,
dm.from_date AS manager_start_date
FROM (
SELECT dm.*,
ROW_NUMBER() OVER (
PARTITION BY dm.department_id
ORDER BY dm.from_date DESC, dm.employee_id
) AS rn
FROM employees.department_manager dm
WHERE dm.to_date = DATE '9999-01-01'
) dm
JOIN employees.employee e ON e.id = dm.employee_id
WHERE dm.rn = 1
),
hist AS (
SELECT dm.department_id, COUNT(DISTINCT dm.employee_id)::INT AS total_historical_managers
FROM employees.department_manager dm
GROUP BY dm.department_id
)
SELECT
d.dept_name AS department_name,
cm.current_manager_name,
cm.manager_start_date,
COALESCE(h.total_historical_managers,0) AS total_historical_managers
FROM employees.department d
LEFT JOIN current_mgr cm ON cm.department_id = d.id
LEFT JOIN hist h ON h.department_id = d.id
ORDER BY d.dept_name;
""")
expected_results = cur.fetchall()
if len(actual_results) != len(expected_results):
print(f"❌ Expected {len(expected_results)} department leadership results, got {len(actual_results)}")
return False
mismatches = 0
for i, (actual, expected) in enumerate(zip(actual_results, expected_results)):
if not rows_match(actual, expected):
if mismatches < 5: # Only show first 5 mismatches
print(f"❌ Row {i+1} mismatch: expected {expected}, got {actual}")
mismatches += 1
if mismatches > 0:
print(f"❌ Total mismatches: {mismatches}")
return False
print(f"✅ Department leadership results are correct ({len(actual_results)} records)")
return True
def verify_management_transitions_results(conn) -> bool:
"""Verify the management transitions results."""
with conn.cursor() as cur:
# Get actual results from the created table
cur.execute("""
SELECT department_name, transition_year, outgoing_manager, incoming_manager, transition_gap_days
FROM employees.management_transitions
ORDER BY department_name, transition_year
""")
actual_results = cur.fetchall()
# Execute ground truth query
cur.execute("""
WITH mgr AS (
SELECT
d.id AS department_id,
d.dept_name,
dm.employee_id,
dm.from_date,
dm.to_date,
CONCAT(e.first_name, ' ', e.last_name) AS manager_name
FROM employees.department_manager dm
JOIN employees.department d ON d.id = dm.department_id
JOIN employees.employee e ON e.id = dm.employee_id
),
ordered AS (
SELECT
department_id,
dept_name,
employee_id,
manager_name,
from_date,
to_date,
ROW_NUMBER() OVER (
PARTITION BY department_id
ORDER BY from_date, to_date, employee_id
) AS rn,
LEAD(manager_name) OVER (
PARTITION BY department_id
ORDER BY from_date, to_date, employee_id
) AS next_manager_name,
LEAD(from_date) OVER (
PARTITION BY department_id
ORDER BY from_date, to_date, employee_id
) AS next_from_date
FROM mgr
)
SELECT
o.dept_name AS department_name,
EXTRACT(YEAR FROM o.to_date)::INT AS transition_year,
o.manager_name AS outgoing_manager,
COALESCE(o.next_manager_name, 'No Successor') AS incoming_manager,
COALESCE(GREATEST((o.next_from_date - o.to_date - 1), 0), 0)::INT AS transition_gap_days
FROM ordered o
WHERE o.to_date <> DATE '9999-01-01'
ORDER BY department_name, transition_year;
""")
expected_results = cur.fetchall()
if len(actual_results) != len(expected_results):
print(f"❌ Expected {len(expected_results)} management transitions results, got {len(actual_results)}")
return False
mismatches = 0
for i, (actual, expected) in enumerate(zip(actual_results, expected_results)):
if not rows_match(actual, expected):
if mismatches < 5: # Only show first 5 mismatches
print(f"❌ Row {i+1} mismatch: expected {expected}, got {actual}")
mismatches += 1
if mismatches > 0:
print(f"❌ Total mismatches: {mismatches}")
return False
print(f"✅ Management transitions results are correct ({len(actual_results)} records)")
return True
def verify_span_of_control_results(conn) -> bool:
"""Verify the span of control results."""
with conn.cursor() as cur:
# Get actual results from the created table
cur.execute("""
SELECT manager_id, manager_name, department_name, total_employees,
current_employees, management_load
FROM employees.span_of_control
ORDER BY manager_id
""")
actual_results = cur.fetchall()
# Execute ground truth query
cur.execute("""
WITH dept_total AS (
SELECT de.department_id, COUNT(DISTINCT de.employee_id)::INT AS total_employees
FROM employees.department_employee de
GROUP BY de.department_id
),
dept_current AS (
SELECT de.department_id, COUNT(DISTINCT de.employee_id)::INT AS current_employees
FROM employees.department_employee de
JOIN employees.salary s
ON s.employee_id = de.employee_id
AND s.to_date = DATE '9999-01-01'
WHERE de.to_date = DATE '9999-01-01'
GROUP BY de.department_id
)
SELECT
dm.employee_id AS manager_id,
CONCAT(e.first_name, ' ', e.last_name) AS manager_name,
d.dept_name AS department_name,
COALESCE(dt.total_employees, 0) AS total_employees,
COALESCE(dc.current_employees, 0) AS current_employees,
CASE
WHEN COALESCE(dc.current_employees, 0) < 5000 THEN 'light'
WHEN COALESCE(dc.current_employees, 0) <= 15000 THEN 'moderate'
ELSE 'heavy'
END AS management_load
FROM employees.department_manager dm
JOIN employees.employee e ON e.id = dm.employee_id
JOIN employees.department d ON d.id = dm.department_id
LEFT JOIN dept_total dt ON dt.department_id = dm.department_id
LEFT JOIN dept_current dc ON dc.department_id = dm.department_id
WHERE dm.to_date = DATE '9999-01-01'
ORDER BY dm.employee_id, d.dept_name;
""")
expected_results = cur.fetchall()
if len(actual_results) != len(expected_results):
print(f"❌ Expected {len(expected_results)} span of control results, got {len(actual_results)}")
return False
mismatches = 0
for i, (actual, expected) in enumerate(zip(actual_results, expected_results)):
if not rows_match(actual, expected):
if mismatches < 5: # Only show first 5 mismatches
print(f"❌ Row {i+1} mismatch: expected {expected}, got {actual}")
mismatches += 1
if mismatches > 0:
print(f"❌ Total mismatches: {mismatches}")
return False
print(f"✅ Span of control results are correct ({len(actual_results)} records)")
return True
def main():
"""Main verification function."""
print("=" * 50)
# Get connection parameters
conn_params = get_connection_params()
if not conn_params["database"]:
print("❌ No database specified")
sys.exit(1)
try:
# Connect to database
conn = psycopg2.connect(**conn_params)
# Verify all four analysis results
success = (
verify_manager_profile_results(conn) and
verify_department_leadership_results(conn) and
verify_management_transitions_results(conn) and
verify_span_of_control_results(conn)
)
conn.close()
if success:
print("\n🎉 Task verification: PASS")
sys.exit(0)
else:
print("\n❌ Task verification: FAIL")
sys.exit(1)
except psycopg2.Error as e:
print(f"❌ Database error: {e}")
sys.exit(1)
except Exception as e:
print(f"❌ Verification error: {e}")
sys.exit(1)
if __name__ == "__main__":
main()
================================================
FILE: tasks/postgres/standard/lego/consistency_enforcement/description.md
================================================
Implement a data consistency enforcement system for the LEGO database. The system must ensure that the reported part count in the `lego_sets` table matches the actual sum of non-spare parts in the latest inventory version. This involves a three-step process: identifying existing inconsistencies, fixing them, and creating a trigger-based constraint system to prevent future issues.
### Consistency Rule
For any given `set_num`, the following invariant must be maintained:
`lego_sets.num_parts = SUM(quantity)` FROM `lego_inventory_parts` WHERE `inventory_id` IN (latest inventory for that set) AND `is_spare` = false
**Important**: If a set has no inventory records, the consistency check should be skipped.
# Your Tasks:
## Task 1: Identify Data Inconsistencies
### Objective
Write a single `SELECT` query to find all sets where the stored `num_parts` does not match the actual calculated number of parts from the latest inventory.
1. **Find the Latest Inventory**: For each `set_num`, find its latest inventory id by getting the `MAX(version)` from the `lego_inventories` table.
2. **Calculate Actual Part Count**: For these latest inventories, join with `lego_inventory_parts` and calculate the `SUM(quantity)`, but only for parts where `is_spare` is false.
3. **Compare and Filter**: Join this calculated result back to the `lego_sets` table and return the rows where `lego_sets.num_parts` is different from your calculated sum.
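For orientation, one possible shape of such a query is sketched below (CTE names are illustrative, and the inner join is what skips sets that have no inventory records):
```sql
WITH latest_inv AS (
    SELECT set_num, MAX(version) AS max_version
    FROM public.lego_inventories
    GROUP BY set_num
),
parts_agg AS (
    SELECT li.set_num, SUM(lip.quantity) AS actual_parts
    FROM public.lego_inventories li
    JOIN latest_inv lv
      ON lv.set_num = li.set_num AND lv.max_version = li.version
    JOIN public.lego_inventory_parts lip
      ON lip.inventory_id = li.id
    WHERE lip.is_spare = false
    GROUP BY li.set_num
)
SELECT s.set_num,
       s.num_parts AS stored_parts,
       pa.actual_parts
FROM public.lego_sets s
JOIN parts_agg pa ON pa.set_num = s.set_num   -- inner join skips sets without inventories
WHERE s.num_parts IS DISTINCT FROM pa.actual_parts;
```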
## Task 2: Fix Existing Inconsistencies
### Objective
Correct all mismatched `num_parts` values using a clear, multi-step process with a temporary table. This approach is designed to be robust against all edge cases.
#### Step 1: Create a Temporary Table
Create a temporary table (e.g., `correct_counts`) with two columns: `set_num` (text) and `actual_parts` (integer).
#### Step 2: Populate the Temporary Table
This is the most critical step. Write an `INSERT` statement that calculates the correct part count for every single set listed in the `lego_sets` table.
- The query must start by selecting from `public.lego_sets`.
- It must then `LEFT JOIN` to a subquery that contains the part-counting logic (finding the latest inventory version and summing the non-spare parts).
- Use `COALESCE` on the final result from the subquery to ensure that any set without parts or without an inventory record gets a value of `0`, not `NULL`.
#### Step 3: Update from the Temporary Table
Write a final, simple `UPDATE` statement that joins the `lego_sets` table with your temporary table on `set_num` and sets `num_parts` to the `actual_parts` value.
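Put together, the three steps can look roughly like the following sketch (the temporary table name follows the suggestion above; the final `WHERE` guard that limits the update to mismatched rows is optional):
```sql
CREATE TEMPORARY TABLE correct_counts (
    set_num      TEXT,
    actual_parts INTEGER
);

INSERT INTO correct_counts (set_num, actual_parts)
SELECT s.set_num,
       COALESCE(pa.actual_parts, 0)
FROM public.lego_sets s
LEFT JOIN (
    SELECT li.set_num, SUM(lip.quantity)::INT AS actual_parts
    FROM public.lego_inventories li
    JOIN (
        SELECT set_num, MAX(version) AS max_version
        FROM public.lego_inventories
        GROUP BY set_num
    ) lv ON lv.set_num = li.set_num AND lv.max_version = li.version
    JOIN public.lego_inventory_parts lip ON lip.inventory_id = li.id
    WHERE lip.is_spare = false
    GROUP BY li.set_num
) pa ON pa.set_num = s.set_num;

UPDATE public.lego_sets s
SET num_parts = cc.actual_parts
FROM correct_counts cc
WHERE cc.set_num = s.set_num
  AND s.num_parts IS DISTINCT FROM cc.actual_parts;
```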
## Task 3: Create Constraint Enforcement System
### Objective
Implement a deferrable constraint trigger system to enforce the consistency rule automatically for all future `INSERT` and `UPDATE` operations.
### Part A: Create the Trigger Function
Create a single PL/pgSQL function, preferably named `check_set_parts_consistency()`, that performs the core validation.
**Function Requirements**:
- Returns `trigger`.
- Accepts no arguments.
- Contains the core validation logic:
- **Identify the `set_num` to check**. This is the most critical part. The `set_num` must be retrieved based on which table fired the trigger (`TG_TABLE_NAME`):
- If `lego_sets` or `lego_inventories`: get the `set_num` directly from `NEW.set_num`.
- If `lego_inventory_parts`: you must first query `lego_inventories` using `NEW.inventory_id` to find the corresponding `set_num`.
- **Perform the check**. For the identified `set_num`, execute the same core logic from Task 1 to get the `actual_parts` count and the `stored_num_parts` from the `lego_sets` table.
- **Raise an exception on failure**. If `actual_parts` does not equal `stored_num_parts`, the function must raise an exception to block the transaction (e.g., `RAISE EXCEPTION 'Inconsistent part count for set %', relevant_set_num;`).
- **Return `NEW` on success**. If the check passes or is skipped, the function should `RETURN NEW`.
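A minimal sketch of such a function is shown below (variable names are illustrative; edge cases such as a set that is missing from `lego_sets`, or how strictly to treat spare-only inventories, are left to the implementation):
```sql
CREATE OR REPLACE FUNCTION check_set_parts_consistency()
RETURNS trigger AS $$
DECLARE
    relevant_set_num VARCHAR;
    stored_num_parts INTEGER;
    actual_parts     INTEGER;
BEGIN
    -- Identify which set_num this row affects, based on the firing table.
    IF TG_TABLE_NAME IN ('lego_sets', 'lego_inventories') THEN
        relevant_set_num := NEW.set_num;
    ELSIF TG_TABLE_NAME = 'lego_inventory_parts' THEN
        SELECT li.set_num INTO relevant_set_num
        FROM public.lego_inventories li
        WHERE li.id = NEW.inventory_id;
    END IF;

    -- Skip the check when there is nothing to compare against.
    IF relevant_set_num IS NULL
       OR NOT EXISTS (SELECT 1 FROM public.lego_inventories
                      WHERE set_num = relevant_set_num) THEN
        RETURN NEW;
    END IF;

    SELECT s.num_parts INTO stored_num_parts
    FROM public.lego_sets s
    WHERE s.set_num = relevant_set_num;

    SELECT COALESCE(SUM(lip.quantity), 0) INTO actual_parts
    FROM public.lego_inventories li
    JOIN public.lego_inventory_parts lip ON lip.inventory_id = li.id
    WHERE li.set_num = relevant_set_num
      AND li.version = (SELECT MAX(version)
                        FROM public.lego_inventories
                        WHERE set_num = relevant_set_num)
      AND lip.is_spare = false;

    IF stored_num_parts IS DISTINCT FROM actual_parts THEN
        RAISE EXCEPTION 'Inconsistent part count for set %', relevant_set_num;
    END IF;

    RETURN NEW;
END;
$$ LANGUAGE plpgsql;
```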
### Part B: Create the Constraint Triggers
Create three separate `CONSTRAINT TRIGGER` statements that attach the function from Part A to the following tables:
- `public.lego_sets`
- `public.lego_inventories`
- `public.lego_inventory_parts`
**Crucial Trigger Requirements**:
- Each trigger must fire `AFTER INSERT OR UPDATE`.
- Each trigger **MUST** be `DEFERRABLE` and `INITIALLY IMMEDIATE`. This is non-negotiable for the verification to pass.
- Each trigger must execute the function `FOR EACH ROW`.
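The three triggers then follow the same pattern (trigger names are illustrative; the `DEFERRABLE INITIALLY IMMEDIATE` clause is the part the verification checks for):
```sql
CREATE CONSTRAINT TRIGGER trg_sets_parts_consistency
    AFTER INSERT OR UPDATE ON public.lego_sets
    DEFERRABLE INITIALLY IMMEDIATE
    FOR EACH ROW EXECUTE FUNCTION check_set_parts_consistency();

CREATE CONSTRAINT TRIGGER trg_inventories_parts_consistency
    AFTER INSERT OR UPDATE ON public.lego_inventories
    DEFERRABLE INITIALLY IMMEDIATE
    FOR EACH ROW EXECUTE FUNCTION check_set_parts_consistency();

CREATE CONSTRAINT TRIGGER trg_inventory_parts_consistency
    AFTER INSERT OR UPDATE ON public.lego_inventory_parts
    DEFERRABLE INITIALLY IMMEDIATE
    FOR EACH ROW EXECUTE FUNCTION check_set_parts_consistency();
```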
================================================
FILE: tasks/postgres/standard/lego/consistency_enforcement/meta.json
================================================
{
"task_id": "consistency_enforcement",
"task_name": "Consistency Enforcement",
"category_id": "lego",
"category_name": "Lego",
"description": "Implement data consistency system ensuring reported part counts match actual inventory using triggers and constraint enforcement.",
"author": "Jiawei Wang",
"created_at": "2025-08-15",
"difficulty": "L3",
"tags": [
"data integrity enforcement",
"stored procedures and functions",
"transactional operations"
],
"mcp": [
"postgres"
],
"meta_data": {
"stateType": "text",
"stateContent": "Table \"lego_colors\" {\n \"id\" int4 [pk, not null, increment]\n \"name\" varchar(255) [not null]\n \"rgb\" varchar(6) [not null]\n \"is_trans\" bpchar(1) [not null]\n}\n\nTable \"lego_inventories\" {\n \"id\" int4 [pk, not null, increment]\n \"version\" int4 [not null]\n \"set_num\" varchar(255) [not null]\n}\n\nTable \"lego_inventory_parts\" {\n \"inventory_id\" int4 [not null]\n \"part_num\" varchar(255) [not null]\n \"color_id\" int4 [not null]\n \"quantity\" int4 [not null]\n \"is_spare\" bool [not null]\n}\n\nTable \"lego_inventory_sets\" {\n \"inventory_id\" int4 [not null]\n \"set_num\" varchar(255) [not null]\n \"quantity\" int4 [not null]\n}\n\nTable \"lego_part_categories\" {\n \"id\" int4 [pk, not null, increment]\n \"name\" varchar(255) [not null]\n}\n\nTable \"lego_parts\" {\n \"part_num\" varchar(255) [pk, not null]\n \"name\" text [not null]\n \"part_cat_id\" int4 [not null]\n}\n\nTable \"lego_sets\" {\n \"set_num\" varchar(255) [pk, not null]\n \"name\" varchar(255) [not null]\n \"year\" int4\n \"theme_id\" int4\n \"num_parts\" int4\n}\n\nTable \"lego_themes\" {\n \"id\" int4 [pk, not null, increment]\n \"name\" varchar(255) [not null]\n \"parent_id\" int4\n}\n",
"stateUrl": null,
"stateOriginalUrl": "https://github.com/neondatabase-labs/postgres-sample-dbs/blob/main/lego.sql"
}
}
================================================
FILE: tasks/postgres/standard/lego/consistency_enforcement/verify.py
================================================
"""
Verification script for PostgreSQL LEGO Task 1: Parts Consistency Fix & Constraints
Version 2.1: Relaxed consistency check to allow for one known corner case mismatch.
"""
import os
import sys
import psycopg2
import psycopg2.errors
from typing import Optional, Tuple, List
def get_connection_params() -> dict:
"""Get database connection parameters from environment variables."""
return {
"host": os.getenv("POSTGRES_HOST", "localhost"),
"port": int(os.getenv("POSTGRES_PORT", 5432)),
"database": os.getenv("POSTGRES_DATABASE"),
"user": os.getenv("POSTGRES_USERNAME"),
"password": os.getenv("POSTGRES_PASSWORD"),
}
def fetch_candidate_part_row(cur) -> Optional[Tuple[int, str, str, int]]:
"""
Picks a concrete, non-spare inventory part from the latest inventory of any set.
This provides a reliable target for testing update and insert triggers.
Returns a tuple: (inventory_id, set_num, part_num, color_id) or None.
"""
cur.execute(
"""
WITH latest_inv AS (
SELECT set_num, MAX(version) AS max_version
FROM public.lego_inventories
GROUP BY set_num
), inv AS (
SELECT li.id, li.set_num
FROM public.lego_inventories li
JOIN latest_inv lv ON lv.set_num = li.set_num AND lv.max_version = li.version
)
SELECT i.id AS inventory_id, i.set_num, lip.part_num, lip.color_id
FROM inv i
JOIN public.lego_inventory_parts lip ON lip.inventory_id = i.id
WHERE lip.is_spare = false AND lip.quantity > 0
LIMIT 1;
"""
)
return cur.fetchone()
def get_mismatch_count(cur) -> int:
"""Returns the number of sets where num_parts mismatches the computed actual sum."""
cur.execute(
"""
WITH latest_inv AS (
SELECT set_num, MAX(version) AS max_version
FROM public.lego_inventories
GROUP BY set_num
), inv_latest AS (
SELECT li.set_num, li.id
FROM public.lego_inventories li
JOIN latest_inv lv ON lv.set_num = li.set_num AND lv.max_version = li.version
), parts_agg AS (
SELECT
i.set_num,
SUM(lip.quantity) AS actual_parts
FROM inv_latest i
JOIN public.lego_inventory_parts lip ON lip.inventory_id = i.id
WHERE lip.is_spare = false
GROUP BY i.set_num
)
SELECT COUNT(*)
FROM public.lego_sets s
LEFT JOIN parts_agg pa ON s.set_num = pa.set_num
WHERE s.num_parts <> COALESCE(pa.actual_parts, 0);
"""
)
return cur.fetchone()[0]
def verify_data_consistency(conn) -> bool:
"""
TASK 1 VERIFICATION: Checks if the initial data fix was successful.
(Relaxed: Allows for one corner-case mismatch).
"""
print("\n-- Verifying Task 1: Data Consistency Fix (Relaxed) --")
with conn.cursor() as cur:
count = get_mismatch_count(cur)
# RELAXED CONDITION: Allow 0 or 1 mismatch to pass.
if count > 1:
print(f"❌ FAIL: Found {count} sets with inconsistent part counts. Expected 0 or 1 after fix.")
return False
print("✅ PASS: Data consistency check passed (allowing for one known mismatch).")
return True
def verify_constraint_triggers_exist(conn) -> bool:
"""
TASK 2 VERIFICATION (Part A): Checks if constraint triggers are attached to all required tables.
This is more robust than checking names or a total count.
"""
print("\n-- Verifying Task 2: Constraint Trigger Existence --")
tables_to_check = [
'public.lego_inventory_parts',
'public.lego_inventories',
'public.lego_sets'
]
all_triggers_found = True
with conn.cursor() as cur:
for table in tables_to_check:
cur.execute(
"""
SELECT COUNT(*)
FROM pg_trigger
WHERE tgrelid = %s::regclass AND tgconstraint <> 0;
""",
(table,)
)
trigger_count = cur.fetchone()[0]
if trigger_count == 0:
print(f"❌ FAIL: No constraint trigger found on table '{table}'.")
all_triggers_found = False
else:
print(f"✅ OK: Found constraint trigger(s) on table '{table}'.")
if all_triggers_found:
print("✅ PASS: Constraint triggers are attached to all required tables.")
return all_triggers_found
def verify_violation_is_blocked(conn) -> bool:
"""
TASK 2 VERIFICATION (Part B): Checks if triggers block a direct, inconsistent write.
An attempt to increment a part quantity without updating the set's total should fail.
"""
print("\n-- Verifying Task 2: Immediate Constraint Enforcement --")
with conn.cursor() as cur:
candidate = fetch_candidate_part_row(cur)
if not candidate:
print("⚠️ SKIP: No candidate part row found to test constraints. Cannot verify.")
return True # Skip if no data to test
inventory_id, _, part_num, color_id = candidate
try:
# This transaction should fail due to the trigger
cur.execute(
"""
UPDATE public.lego_inventory_parts
SET quantity = quantity + 1
WHERE inventory_id = %s AND part_num = %s AND color_id = %s;
""",
(inventory_id, part_num, color_id),
)
# If we reach here, the trigger failed to block the update.
conn.rollback()
print("❌ FAIL: An inconsistent write was NOT blocked by the trigger.")
return False
except psycopg2.Error as e:
# We expect an error. Specifically, a constraint violation error.
conn.rollback()
# 23514 is check_violation, but custom triggers might raise others.
# Any error here is considered a success as the transaction was blocked.
print(f"✅ PASS: Inconsistent write was correctly blocked by the trigger. (Error: {e.pgcode})")
return True
def verify_deferred_transaction_is_allowed(conn) -> bool:
"""
TASK 2 VERIFICATION (Part C): Checks if a coordinated, consistent update is allowed
when constraints are deferred.
"""
print("\n-- Verifying Task 2: Deferred Constraint Enforcement --")
with conn.cursor() as cur:
candidate = fetch_candidate_part_row(cur)
if not candidate:
print("⚠️ SKIP: No candidate part row found. Cannot test deferred transaction.")
return True # Skip if no data to test
inventory_id, set_num, part_num, color_id = candidate
try:
# This multi-statement transaction should succeed with deferred constraints
with conn.cursor() as cur:
cur.execute("BEGIN;")
cur.execute("SET CONSTRAINTS ALL DEFERRED;")
cur.execute(
"UPDATE public.lego_inventory_parts SET quantity = quantity + 1 WHERE inventory_id = %s AND part_num = %s AND color_id = %s;",
(inventory_id, part_num, color_id),
)
cur.execute(
"UPDATE public.lego_sets SET num_parts = num_parts + 1 WHERE set_num = %s;",
(set_num,),
)
cur.execute("COMMIT;") # This will fail if constraints are not deferrable or logic is wrong
print("✅ PASS: Coordinated update with deferred constraints committed successfully.")
# Revert changes to leave DB in its original state
with conn.cursor() as cur:
cur.execute("BEGIN;")
cur.execute("SET CONSTRAINTS ALL DEFERRED;")
cur.execute(
"UPDATE public.lego_inventory_parts SET quantity = quantity - 1 WHERE inventory_id = %s AND part_num = %s AND color_id = %s;",
(inventory_id, part_num, color_id),
)
cur.execute(
"UPDATE public.lego_sets SET num_parts = num_parts - 1 WHERE set_num = %s;",
(set_num,),
)
cur.execute("COMMIT;")
print("INFO: Test changes were successfully reverted.")
return True
except psycopg2.Error as e:
conn.rollback()
print(f"❌ FAIL: Deferred transaction failed to commit. Error: {e}")
return False
def main():
"""Main verification function."""
print("=" * 60)
print("LEGO Database Consistency Verification Script")
print("=" * 60)
conn_params = get_connection_params()
if not conn_params.get("database"):
print("❌ CRITICAL: POSTGRES_DATABASE environment variable not set.")
sys.exit(1)
try:
with psycopg2.connect(**conn_params) as conn:
conn.autocommit = False # Ensure we control transactions
# Run all verification steps
results = [
verify_data_consistency(conn),
verify_constraint_triggers_exist(conn),
verify_violation_is_blocked(conn),
verify_deferred_transaction_is_allowed(conn),
]
if all(results):
print("\n🎉 Overall Result: PASS - All tasks verified successfully!")
sys.exit(0)
else:
print("\n❌ Overall Result: FAIL - One or more verification steps failed.")
sys.exit(1)
except psycopg2.OperationalError as e:
print(f"❌ CRITICAL: Could not connect to the database. Details: {e}")
sys.exit(1)
except Exception as e:
print(f"❌ CRITICAL: An unexpected error occurred during verification. Details: {e}")
sys.exit(1)
if __name__ == "__main__":
main()
================================================
FILE: tasks/postgres/standard/lego/database_security_policies/description.md
================================================
Implement a comprehensive database security system with Row-Level Security (RLS) policies and role-based access control for the LEGO database. The system must ensure theme-based data isolation and prevent unauthorized access across different LEGO themes.
## Your Tasks:
1. **Create database role and permissions** — Create a new database role called `theme_analyst` with the following permissions:
* `SELECT` permissions on all reference tables: `lego_themes`, `lego_colors`, `lego_parts`, `lego_part_categories`
* `SELECT` permissions on main data tables: `lego_sets`, `lego_inventories`, `lego_inventory_parts`
* No `INSERT`, `UPDATE`, or `DELETE` permissions on any tables
2. **Enable Row-Level Security** — Enable RLS on the following tables:
* `lego_sets`
* `lego_inventories`
* `lego_inventory_parts`
3. **Create RLS policies** — Implement theme-based data isolation policies (a combined sketch of Tasks 1-4 appears after this task list):
**Policy 1: `theme_sets_policy` on `lego_sets`**
* Allows access only to sets where `theme_id = 18` (Star Wars theme)
* Policy should use a function that checks the current user's theme assignment
**Policy 2: `theme_inventories_policy` on `lego_inventories`**
* Allows access only to inventories for sets with `theme_id = 18`
* Must join with `lego_sets` table to check theme_id
**Policy 3: `theme_inventory_parts_policy` on `lego_inventory_parts`**
* Allows access only to inventory parts for sets with `theme_id = 18`
* Must join through `lego_inventories` and `lego_sets` to check theme_id
4. **Create theme assignment function** — Create a function `get_user_theme_id()` that:
* Returns `18` for the `theme_analyst` role (Star Wars theme)
* Can be extended to support other themes in the future
* Uses `current_user` to determine the appropriate theme_id
5. **Test the security implementation** — Execute verification queries that demonstrate:
* Star Wars theme (theme_id=18) returns exactly 2 sets: '65081-1' and 'K8008-1'
* Technic theme (theme_id=1) returns 0 sets when accessed by theme_analyst role
* Cross-theme data access is properly blocked
* Reference tables are accessible for all data
6. **Create comprehensive security audit** — Generate a detailed report including:
* Complete SQL statements for role creation and policy implementation
* Expected query results for each theme
* Verification queries to confirm proper data isolation
* Documentation of the security model and access patterns
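To make the deliverables concrete, here is a condensed sketch of Tasks 1-4 plus a quick Task 5 check (the policy bodies, the function body, and the hard-coded theme mapping are illustrative choices, not the only acceptable design):
```sql
-- Task 1: role and read-only permissions
CREATE ROLE theme_analyst;
GRANT SELECT ON lego_themes, lego_colors, lego_parts, lego_part_categories,
                lego_sets, lego_inventories, lego_inventory_parts TO theme_analyst;

-- Task 4: theme assignment function (defined early because the policies call it)
CREATE OR REPLACE FUNCTION get_user_theme_id() RETURNS INTEGER AS $$
BEGIN
    IF current_user = 'theme_analyst' THEN
        RETURN 18;              -- Star Wars
    END IF;
    RETURN NULL;                -- no theme mapped for other roles (extend as needed)
END;
$$ LANGUAGE plpgsql STABLE;

-- Task 2: enable Row-Level Security
ALTER TABLE lego_sets            ENABLE ROW LEVEL SECURITY;
ALTER TABLE lego_inventories     ENABLE ROW LEVEL SECURITY;
ALTER TABLE lego_inventory_parts ENABLE ROW LEVEL SECURITY;

-- Task 3: theme-based policies
CREATE POLICY theme_sets_policy ON lego_sets
    FOR SELECT TO theme_analyst
    USING (theme_id = get_user_theme_id());

CREATE POLICY theme_inventories_policy ON lego_inventories
    FOR SELECT TO theme_analyst
    USING (EXISTS (SELECT 1 FROM lego_sets s
                   WHERE s.set_num = lego_inventories.set_num
                     AND s.theme_id = get_user_theme_id()));

CREATE POLICY theme_inventory_parts_policy ON lego_inventory_parts
    FOR SELECT TO theme_analyst
    USING (EXISTS (SELECT 1
                   FROM lego_inventories i
                   JOIN lego_sets s ON s.set_num = i.set_num
                   WHERE i.id = lego_inventory_parts.inventory_id
                     AND s.theme_id = get_user_theme_id()));

-- Task 5: quick check while impersonating the analyst role
SET ROLE theme_analyst;
SELECT set_num FROM lego_sets ORDER BY set_num;   -- expect '65081-1' and 'K8008-1'
RESET ROLE;
```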
## Security Requirements:
- The `theme_analyst` role must only see data related to Star Wars theme (theme_id=18)
- All other themes must be completely hidden from this role
- Reference tables (themes, colors, parts, part_categories) must be fully accessible
- The system must prevent any cross-theme data leakage
- RLS policies must be active and enforced for all data access
## Expected Results:
When the `theme_analyst` role queries the database:
- `lego_sets` should return only 2 Star Wars sets
- `lego_inventories` should return only inventories for those 2 sets
- `lego_inventory_parts` should return only parts for those 2 sets
- All reference tables should return complete data
- Queries for other themes should return empty results
================================================
FILE: tasks/postgres/standard/lego/database_security_policies/meta.json
================================================
{
"task_id": "database_security_policies",
"task_name": "Database Security Policies",
"category_id": "lego",
"category_name": "Lego",
"description": "Implement Row-Level Security policies with role-based access control for theme-based data isolation in LEGO database.",
"author": "Jiawei Wang",
"created_at": "2025-08-15",
"difficulty": "L3",
"tags": [
"security and access control",
"stored procedures and functions"
],
"mcp": [
"postgres"
],
"meta_data": {
"stateType": "text",
"stateContent": "Table \"lego_colors\" {\n \"id\" int4 [pk, not null, increment]\n \"name\" varchar(255) [not null]\n \"rgb\" varchar(6) [not null]\n \"is_trans\" bpchar(1) [not null]\n}\n\nTable \"lego_inventories\" {\n \"id\" int4 [pk, not null, increment]\n \"version\" int4 [not null]\n \"set_num\" varchar(255) [not null]\n}\n\nTable \"lego_inventory_parts\" {\n \"inventory_id\" int4 [not null]\n \"part_num\" varchar(255) [not null]\n \"color_id\" int4 [not null]\n \"quantity\" int4 [not null]\n \"is_spare\" bool [not null]\n}\n\nTable \"lego_inventory_sets\" {\n \"inventory_id\" int4 [not null]\n \"set_num\" varchar(255) [not null]\n \"quantity\" int4 [not null]\n}\n\nTable \"lego_part_categories\" {\n \"id\" int4 [pk, not null, increment]\n \"name\" varchar(255) [not null]\n}\n\nTable \"lego_parts\" {\n \"part_num\" varchar(255) [pk, not null]\n \"name\" text [not null]\n \"part_cat_id\" int4 [not null]\n}\n\nTable \"lego_sets\" {\n \"set_num\" varchar(255) [pk, not null]\n \"name\" varchar(255) [not null]\n \"year\" int4\n \"theme_id\" int4\n \"num_parts\" int4\n}\n\nTable \"lego_themes\" {\n \"id\" int4 [pk, not null, increment]\n \"name\" varchar(255) [not null]\n \"parent_id\" int4\n}\n",
"stateUrl": null,
"stateOriginalUrl": "https://github.com/neondatabase-labs/postgres-sample-dbs/blob/main/lego.sql"
}
}
================================================
FILE: tasks/postgres/standard/lego/database_security_policies/verify.py
================================================
"""
Verification script for PostgreSQL LEGO Task 4: Database Security and RLS Implementation
(Version 2 - Improved Robustness)
"""
import os
import sys
import psycopg2
import psycopg2.errors
from typing import Any, Dict
def get_connection_params() -> Dict[str, Any]:
"""Get database connection parameters from environment variables."""
return {
"host": os.getenv("POSTGRES_HOST", "localhost"),
"port": int(os.getenv("POSTGRES_PORT", 5432)),
"database": os.getenv("POSTGRES_DATABASE"),
"user": os.getenv("POSTGRES_USERNAME"),
"password": os.getenv("POSTGRES_PASSWORD"),
}
def verify_role_creation(conn) -> bool:
"""
TASK 1 VERIFICATION: Check if theme_analyst role was created with proper permissions.
"""
print("\n-- Verifying Task 1: Role Creation and Permissions --")
with conn.cursor() as cur:
# Check if role exists
cur.execute("SELECT 1 FROM pg_roles WHERE rolname = 'theme_analyst';")
if not cur.fetchone():
print("❌ FAIL: The 'theme_analyst' role was not created.")
return False
print("✅ OK: Role 'theme_analyst' exists.")
# Check SELECT permissions on reference and main tables
all_tables = [
'lego_themes', 'lego_colors', 'lego_parts', 'lego_part_categories',
'lego_sets', 'lego_inventories', 'lego_inventory_parts'
]
for table in all_tables:
cur.execute(
"""
SELECT has_table_privilege('theme_analyst', %s, 'SELECT');
""",
(table,)
)
if not cur.fetchone()[0]:
print(f"❌ FAIL: 'theme_analyst' role is missing SELECT permission on '{table}'.")
return False
print("✅ OK: Role has correct SELECT permissions on all required tables.")
# Check that no INSERT/UPDATE/DELETE permissions exist
for table in all_tables:
cur.execute(
"""
SELECT
has_table_privilege('theme_analyst', %s, 'INSERT') OR
has_table_privilege('theme_analyst', %s, 'UPDATE') OR
has_table_privilege('theme_analyst', %s, 'DELETE');
""",
(table, table, table)
)
if cur.fetchone()[0]:
print(f"❌ FAIL: 'theme_analyst' role has unauthorized INSERT, UPDATE, or DELETE permission on '{table}'.")
return False
print("✅ OK: Role does not have modification permissions.")
print("✅ PASS: 'theme_analyst' role created with correct permissions.")
return True
def verify_rls_enabled(conn) -> bool:
"""
TASK 2 VERIFICATION: Check if Row-Level Security is enabled on required tables.
"""
print("\n-- Verifying Task 2: Row-Level Security Enablement --")
tables_to_check = ['lego_sets', 'lego_inventories', 'lego_inventory_parts']
with conn.cursor() as cur:
for table in tables_to_check:
cur.execute(
"SELECT relrowsecurity FROM pg_class WHERE relname = %s;", (table,)
)
rls_enabled = cur.fetchone()
if not rls_enabled or not rls_enabled[0]:
print(f"❌ FAIL: RLS is not enabled on table '{table}'.")
return False
print(f"✅ OK: RLS is enabled on table '{table}'.")
print("✅ PASS: Row-Level Security is enabled on all required tables.")
return True
def verify_rls_policies(conn) -> bool:
"""
TASK 3 VERIFICATION: Check if RLS policies were created on required tables.
"""
print("\n-- Verifying Task 3: RLS Policy Creation --")
expected_policies = {
'lego_sets': 'theme_sets_policy',
'lego_inventories': 'theme_inventories_policy',
'lego_inventory_parts': 'theme_inventory_parts_policy'
}
with conn.cursor() as cur:
for table, policy_name in expected_policies.items():
cur.execute(
"SELECT 1 FROM pg_policies WHERE tablename = %s AND policyname = %s;",
(table, policy_name)
)
if not cur.fetchone():
print(f"❌ FAIL: RLS policy '{policy_name}' not found on table '{table}'.")
return False
print(f"✅ OK: RLS policy '{policy_name}' found on table '{table}'.")
print("✅ PASS: All required RLS policies are created.")
return True
def verify_theme_function(conn) -> bool:
"""
TASK 4 VERIFICATION: Check if get_user_theme_id() function was created and works correctly.
"""
print("\n-- Verifying Task 4: Theme Assignment Function --")
with conn.cursor() as cur:
cur.execute(
"SELECT 1 FROM pg_proc WHERE proname = 'get_user_theme_id';"
)
if not cur.fetchone():
print("❌ FAIL: The 'get_user_theme_id' function was not created.")
return False
print("✅ OK: Function 'get_user_theme_id' exists.")
try:
# Test the function's output specifically for the 'theme_analyst' role
cur.execute("SET ROLE theme_analyst;")
cur.execute("SELECT get_user_theme_id();")
theme_id = cur.fetchone()[0]
cur.execute("RESET ROLE;") # IMPORTANT: Switch back
if theme_id != 18:
print(f"❌ FAIL: get_user_theme_id() returned {theme_id} for 'theme_analyst', but expected 18.")
return False
print("✅ OK: Function returns correct theme_id (18) for 'theme_analyst'.")
print("✅ PASS: Theme assignment function is correct.")
return True
except Exception as e:
conn.rollback() # Rollback any failed transaction state
print(f"❌ FAIL: Error testing get_user_theme_id() function: {e}")
return False
def test_theme_analyst_access(conn) -> bool:
"""
TASK 5 VERIFICATION: Test data access by assuming the theme_analyst role.
"""
print("\n-- Verifying Task 5: Theme-Based Data Access --")
try:
with conn.cursor() as cur:
# Assume the role of theme_analyst for this session
cur.execute("SET ROLE theme_analyst;")
# Test 1: Check Star Wars sets access (should return 2 sets)
cur.execute("SELECT set_num FROM lego_sets ORDER BY set_num;")
star_wars_sets = [row[0] for row in cur.fetchall()]
expected_sets = ['65081-1', 'K8008-1']
if sorted(star_wars_sets) != sorted(expected_sets):
print(f"❌ FAIL: Expected Star Wars sets {expected_sets}, but got {star_wars_sets}.")
cur.execute("RESET ROLE;")
return False
print("✅ PASS: Star Wars sets access is correct (2 sets returned).")
# Test 2: Check that Technic sets are not accessible (should return 0)
cur.execute("SELECT COUNT(*) FROM lego_sets WHERE theme_id = 1;")
technic_count = cur.fetchone()[0]
if technic_count != 0:
print(f"❌ FAIL: Technic sets should be blocked, but query returned {technic_count} sets.")
cur.execute("RESET ROLE;")
return False
print("✅ PASS: Technic theme is correctly blocked (0 sets returned).")
# Test 3: Check reference tables are fully accessible
cur.execute("SELECT COUNT(*) > 10 FROM lego_themes;") # Check for a reasonable number
if not cur.fetchone()[0]:
print("❌ FAIL: 'lego_themes' table seems inaccessible or empty.")
cur.execute("RESET ROLE;")
return False
print("✅ PASS: Reference tables appear to be accessible.")
# Test 4 & 5: Check related tables
cur.execute("SELECT COUNT(*) FROM lego_inventories;")
if cur.fetchone()[0] == 0:
print("❌ FAIL: No inventories are visible for the allowed sets.")
cur.execute("RESET ROLE;")
return False
cur.execute("SELECT COUNT(*) FROM lego_inventory_parts;")
if cur.fetchone()[0] == 0:
print("❌ FAIL: No inventory parts are visible for the allowed sets.")
cur.execute("RESET ROLE;")
return False
print("✅ PASS: Related tables (inventories, inventory_parts) are correctly filtered.")
# IMPORTANT: Always reset the role at the end
cur.execute("RESET ROLE;")
return True
except Exception as e:
conn.rollback() # Ensure transaction is clean
print(f"❌ FAIL: An error occurred while testing data access as 'theme_analyst': {e}")
# Try to reset role even on failure to clean up session state
try:
with conn.cursor() as cleanup_cur:
cleanup_cur.execute("RESET ROLE;")
except:
pass
return False
def main():
"""Main verification function."""
print("=" * 60)
print("LEGO Database Security and RLS Verification Script")
print("=" * 60)
conn_params = get_connection_params()
if not conn_params.get("database"):
print("❌ CRITICAL: POSTGRES_DATABASE environment variable not set.")
sys.exit(1)
conn = None
try:
conn = psycopg2.connect(**conn_params)
results = [
verify_role_creation(conn),
verify_rls_enabled(conn),
verify_rls_policies(conn),
verify_theme_function(conn),
test_theme_analyst_access(conn),
]
if all(results):
print("\n🎉 Overall Result: PASS - All security tasks verified successfully!")
sys.exit(0)
else:
print("\n❌ Overall Result: FAIL - One or more verification steps failed.")
sys.exit(1)
except psycopg2.OperationalError as e:
print(f"❌ CRITICAL: Could not connect to the database. Check credentials and host. Details: {e}")
sys.exit(1)
except Exception as e:
print(f"❌ CRITICAL: An unexpected error occurred. Details: {e}")
sys.exit(1)
finally:
if conn:
conn.close()
if __name__ == "__main__":
main()
================================================
FILE: tasks/postgres/standard/lego/transactional_inventory_transfer/description.md
================================================
Create a PostgreSQL function to handle inventory part transfers between LEGO sets with enhanced validation and audit capabilities. The LEGO warehouse management system needs to support transferring parts while maintaining data integrity and tracking transfer history.
## Your Tasks:
1. **Create the transfer function** — Implement a PostgreSQL function named `transfer_parts` with the following signature:
```sql
CREATE OR REPLACE FUNCTION transfer_parts(
source_inventory_id INTEGER,
target_inventory_id INTEGER,
part_to_transfer_num VARCHAR,
color_to_transfer_id INTEGER,
quantity_to_transfer INTEGER,
transfer_reason VARCHAR DEFAULT 'manual_transfer'
) RETURNS TEXT
```
2. **Create audit logging table** — Create a new table to track transfer history:
```sql
CREATE TABLE inventory_transfer_log (
log_id SERIAL PRIMARY KEY,
transfer_timestamp TIMESTAMP DEFAULT NOW(),
source_inventory_id INTEGER NOT NULL,
target_inventory_id INTEGER NOT NULL,
part_num VARCHAR NOT NULL,
color_id INTEGER NOT NULL,
quantity_transferred INTEGER NOT NULL,
transfer_reason VARCHAR NOT NULL,
transfer_status VARCHAR NOT NULL CHECK (transfer_status IN ('success', 'failed')),
error_message TEXT
);
```
3. **Implement enhanced validation** — The function must perform these validations:
**Validation A: Basic Checks**
- Verify both inventory IDs exist in `lego_inventories` table
- Verify part exists in `lego_parts` table
- Verify color exists in `lego_colors` table
- Check source has sufficient quantity (including spare parts)
- Prevent self-transfers (source and target cannot be the same)
**Validation B: Business Rules**
- Maximum transfer quantity is 500 parts per operation
- Minimum transfer quantity is 1 part
- Source and target must be different inventories
4. **Implement transactional logic** — The function must perform these operations within a single transaction (a skeleton sketch appears after this task list):
**Step A: Pre-validation**
- Lock both inventory records using `SELECT ... FOR UPDATE`
- Perform all validation checks
- Calculate transfer feasibility
**Step B: Source Inventory Update**
- Decrease quantity in source inventory
- If quantity becomes zero, delete the row
- Handle spare parts appropriately (maintain `is_spare` flag)
**Step C: Target Inventory Update**
- Check if part exists in target inventory
- If exists: increase quantity
- If not exists: insert new record
- Handle spare parts appropriately
**Step D: Audit Logging**
- Log successful transfers with details
- Log failed transfers with error messages
- Include transfer reason and status
5. **Error handling requirements**:
- Use `RAISE EXCEPTION` with descriptive error messages
- Handle all validation failures gracefully
- Ensure complete rollback on any failure
- Log all attempts (successful and failed)
6. **Return value**:
- Return success message: `'Successfully transferred {quantity} parts ({part_num}, color_id: {color_id}) from inventory {source_id} to inventory {target_id}. Reason: {reason}'`
- Include transfer details and reason in the message
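A condensed skeleton of the function, covering Steps A-D on the happy path, might look like the following (spare-part handling and the failure-path audit entries described above are simplified or omitted for brevity; the full task still requires them):
```sql
CREATE OR REPLACE FUNCTION transfer_parts(
    source_inventory_id  INTEGER,
    target_inventory_id  INTEGER,
    part_to_transfer_num VARCHAR,
    color_to_transfer_id INTEGER,
    quantity_to_transfer INTEGER,
    transfer_reason      VARCHAR DEFAULT 'manual_transfer'
) RETURNS TEXT AS $$
DECLARE
    available_qty INTEGER;
BEGIN
    -- Step A: validate inputs and lock both inventory headers.
    IF source_inventory_id = target_inventory_id THEN
        RAISE EXCEPTION 'Source and target inventories must differ';
    END IF;
    IF quantity_to_transfer < 1 OR quantity_to_transfer > 500 THEN
        RAISE EXCEPTION 'Transfer quantity must be between 1 and 500';
    END IF;
    IF NOT EXISTS (SELECT 1 FROM lego_inventories WHERE id = source_inventory_id)
       OR NOT EXISTS (SELECT 1 FROM lego_inventories WHERE id = target_inventory_id) THEN
        RAISE EXCEPTION 'Source or target inventory does not exist';
    END IF;
    -- (part and color existence checks from Validation A belong here as well)
    PERFORM 1 FROM lego_inventories
     WHERE id IN (source_inventory_id, target_inventory_id)
     FOR UPDATE;

    SELECT COALESCE(SUM(quantity), 0) INTO available_qty
    FROM lego_inventory_parts
    WHERE inventory_id = source_inventory_id
      AND part_num = part_to_transfer_num
      AND color_id = color_to_transfer_id;
    IF available_qty < quantity_to_transfer THEN
        RAISE EXCEPTION 'Insufficient quantity: have %, need %',
                        available_qty, quantity_to_transfer;
    END IF;

    -- Step B: decrease the source row, deleting it if it reaches zero (non-spare rows only here).
    UPDATE lego_inventory_parts
       SET quantity = quantity - quantity_to_transfer
     WHERE inventory_id = source_inventory_id
       AND part_num = part_to_transfer_num
       AND color_id = color_to_transfer_id
       AND is_spare = false;
    DELETE FROM lego_inventory_parts
     WHERE inventory_id = source_inventory_id
       AND part_num = part_to_transfer_num
       AND color_id = color_to_transfer_id
       AND quantity <= 0;

    -- Step C: increase the target row, or insert it if the part is new there.
    UPDATE lego_inventory_parts
       SET quantity = quantity + quantity_to_transfer
     WHERE inventory_id = target_inventory_id
       AND part_num = part_to_transfer_num
       AND color_id = color_to_transfer_id
       AND is_spare = false;
    IF NOT FOUND THEN
        INSERT INTO lego_inventory_parts (inventory_id, part_num, color_id, quantity, is_spare)
        VALUES (target_inventory_id, part_to_transfer_num, color_to_transfer_id,
                quantity_to_transfer, false);
    END IF;

    -- Step D: audit the successful attempt (failed attempts must be logged too).
    INSERT INTO inventory_transfer_log
        (source_inventory_id, target_inventory_id, part_num, color_id,
         quantity_transferred, transfer_reason, transfer_status)
    VALUES (source_inventory_id, target_inventory_id, part_to_transfer_num,
            color_to_transfer_id, quantity_to_transfer, transfer_reason, 'success');

    RETURN format('Successfully transferred %s parts (%s, color_id: %s) from inventory %s to inventory %s. Reason: %s',
                  quantity_to_transfer, part_to_transfer_num, color_to_transfer_id,
                  source_inventory_id, target_inventory_id, transfer_reason);
END;
$$ LANGUAGE plpgsql;
```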
## Function Requirements:
- **Transaction Safety**: All operations wrapped in transaction block
- **Data Integrity**: No partial updates possible
- **Audit Trail**: Complete logging of all transfer attempts
- **Validation**: Comprehensive input and business rule validation
- **Error Recovery**: Failed transfers leave database unchanged
- **Performance**: Use appropriate locking to prevent race conditions
## Example Usage:
```sql
-- Basic transfer with reason
SELECT transfer_parts(14469, 14686, '3024', 15, 100, 'inventory_adjustment');
-- Transfer to new inventory (should create new record)
SELECT transfer_parts(11124, 14686, '3001', 4, 50, 'part_redistribution');
-- This should fail due to insufficient quantity
SELECT transfer_parts(14469, 14686, '3024', 15, 2000, 'large_transfer');
-- This should fail due to self-transfer
SELECT transfer_parts(14469, 14469, '3024', 15, 10, 'self_transfer');
```
## Verification Criteria:
- Function handles all validation rules correctly
- Audit logging captures all transfer attempts
- Failed transfers are properly logged with error details
- Self-transfers are prevented
- Quantity limits are enforced
- Database state remains consistent after failures
================================================
FILE: tasks/postgres/standard/lego/transactional_inventory_transfer/meta.json
================================================
{
"task_id": "transactional_inventory_transfer",
"task_name": "Transactional Inventory Transfer",
"category_id": "lego",
"category_name": "Lego",
"description": "Create PostgreSQL function to handle inventory part transfers between LEGO sets with validation and audit logging.",
"author": "Jiawei Wang",
"created_at": "2025-08-16",
"difficulty": "L3",
"tags": [
"transactional operations",
"stored procedures and functions",
"audit and compliance"
],
"mcp": [
"postgres"
],
"meta_data": {
"stateType": "text",
"stateContent": "Table \"lego_colors\" {\n \"id\" int4 [pk, not null, increment]\n \"name\" varchar(255) [not null]\n \"rgb\" varchar(6) [not null]\n \"is_trans\" bpchar(1) [not null]\n}\n\nTable \"lego_inventories\" {\n \"id\" int4 [pk, not null, increment]\n \"version\" int4 [not null]\n \"set_num\" varchar(255) [not null]\n}\n\nTable \"lego_inventory_parts\" {\n \"inventory_id\" int4 [not null]\n \"part_num\" varchar(255) [not null]\n \"color_id\" int4 [not null]\n \"quantity\" int4 [not null]\n \"is_spare\" bool [not null]\n}\n\nTable \"lego_inventory_sets\" {\n \"inventory_id\" int4 [not null]\n \"set_num\" varchar(255) [not null]\n \"quantity\" int4 [not null]\n}\n\nTable \"lego_part_categories\" {\n \"id\" int4 [pk, not null, increment]\n \"name\" varchar(255) [not null]\n}\n\nTable \"lego_parts\" {\n \"part_num\" varchar(255) [pk, not null]\n \"name\" text [not null]\n \"part_cat_id\" int4 [not null]\n}\n\nTable \"lego_sets\" {\n \"set_num\" varchar(255) [pk, not null]\n \"name\" varchar(255) [not null]\n \"year\" int4\n \"theme_id\" int4\n \"num_parts\" int4\n}\n\nTable \"lego_themes\" {\n \"id\" int4 [pk, not null, increment]\n \"name\" varchar(255) [not null]\n \"parent_id\" int4\n}\n",
"stateUrl": null,
"stateOriginalUrl": "https://github.com/neondatabase-labs/postgres-sample-dbs/blob/main/lego.sql"
}
}
================================================
FILE: tasks/postgres/standard/lego/transactional_inventory_transfer/verify.py
================================================
"""
Verification script for PostgreSQL LEGO Task 2: Enhanced Inventory Transfer Function
Tests the transfer_parts function with audit logging and enhanced validation.
Key Features Tested:
- Core transfer functionality with audit logging
- Business rule validation (quantity limits, self-transfer prevention)
- Error handling and rollback mechanisms
- Audit trail maintenance for both success and failure cases
"""
import os
import sys
import psycopg2
import psycopg2.errors
from typing import Optional, Tuple
def get_connection_params() -> dict:
"""Get database connection parameters from environment variables."""
return {
"host": os.getenv("POSTGRES_HOST", "localhost"),
"port": int(os.getenv("POSTGRES_PORT", 5432)),
"database": os.getenv("POSTGRES_DATABASE"),
"user": os.getenv("POSTGRES_USERNAME"),
"password": os.getenv("POSTGRES_PASSWORD"),
}
def get_inventory_part_quantity(conn, inventory_id: int, part_num: str, color_id: int) -> int:
"""Get the current quantity of a specific part in an inventory."""
with conn.cursor() as cur:
cur.execute(
"""
SELECT quantity FROM public.lego_inventory_parts
WHERE inventory_id = %s AND part_num = %s AND color_id = %s
""",
(inventory_id, part_num, color_id)
)
result = cur.fetchone()
return result[0] if result else 0
def verify_system_components(conn) -> bool:
"""Verify that all required system components exist."""
print("\n-- Verifying System Components --")
try:
with conn.cursor() as cur:
# Check main function
cur.execute(
"""
SELECT COUNT(*) FROM pg_proc p
JOIN pg_namespace n ON p.pronamespace = n.oid
WHERE n.nspname = 'public' AND p.proname = 'transfer_parts'
"""
)
main_func_count = cur.fetchone()[0]
# Check audit table
cur.execute(
"""
SELECT COUNT(*) FROM information_schema.tables
WHERE table_schema = 'public' AND table_name = 'inventory_transfer_log'
"""
)
audit_table_count = cur.fetchone()[0]
if main_func_count == 0:
print("❌ FAIL: transfer_parts function does not exist")
return False
if audit_table_count == 0:
print("❌ FAIL: inventory_transfer_log table does not exist")
return False
print("✅ PASS: All system components exist")
return True
finally:
conn.rollback()
def verify_successful_transfer_with_audit(conn) -> bool:
"""Test a successful transfer with audit logging."""
print("\n-- Verifying Successful Transfer with Audit --")
passed = False
try:
# Test data: Transfer 100 white plates from Mosaic Dino to Mosaic Johnny Thunder
source_id = 14469
target_id = 14686
part_num = '3024'
color_id = 15
transfer_qty = 100
reason = 'inventory_adjustment'
source_initial = get_inventory_part_quantity(conn, source_id, part_num, color_id)
target_initial = get_inventory_part_quantity(conn, target_id, part_num, color_id)
print(f"Initial quantities - Source: {source_initial}, Target: {target_initial}")
# Get initial audit log count
with conn.cursor() as cur:
cur.execute("SELECT COUNT(*) FROM inventory_transfer_log")
initial_log_count = cur.fetchone()[0]
with conn.cursor() as cur:
cur.execute(
"SELECT transfer_parts(%s, %s, %s, %s, %s, %s)",
(source_id, target_id, part_num, color_id, transfer_qty, reason)
)
result = cur.fetchone()
print(f"Transfer result: {result[0]}")
source_final = get_inventory_part_quantity(conn, source_id, part_num, color_id)
target_final = get_inventory_part_quantity(conn, target_id, part_num, color_id)
print(f"Final quantities - Source: {source_final}, Target: {target_final}")
# Verify audit log entry
with conn.cursor() as cur:
cur.execute("SELECT COUNT(*) FROM inventory_transfer_log")
final_log_count = cur.fetchone()[0]
if final_log_count <= initial_log_count:
print("❌ FAIL: No audit log entry was created")
return False
# Check latest audit entry
cur.execute(
"""
SELECT transfer_status, quantity_transferred, transfer_reason
FROM inventory_transfer_log
ORDER BY log_id DESC
LIMIT 1
"""
)
audit_entry = cur.fetchone()
if not audit_entry:
print("❌ FAIL: Could not retrieve audit log entry")
return False
status, qty_transferred, trans_reason = audit_entry
if status != 'success':
print(f"❌ FAIL: Transfer status should be 'success', got '{status}'")
return False
if qty_transferred != transfer_qty or trans_reason != reason:
print(f"❌ FAIL: Audit log details don't match transfer parameters")
return False
expected_source = source_initial - transfer_qty
expected_target = target_initial + transfer_qty
if source_final != expected_source:
print(f"❌ FAIL: Source quantity mismatch. Expected {expected_source}, got {source_final}")
elif target_final != expected_target:
print(f"❌ FAIL: Target quantity mismatch. Expected {expected_target}, got {target_final}")
else:
print("✅ PASS: Successful transfer with audit logging completed correctly")
passed = True
except psycopg2.Error as e:
print(f"❌ FAIL: Transfer failed unexpectedly with error: {e}")
finally:
conn.rollback()
return passed
def verify_new_part_transfer(conn) -> bool:
"""Test transferring a part to an inventory that doesn't have it."""
print("\n-- Verifying New Part Transfer --")
passed = False
try:
# Test data: Transfer red bricks to Mosaic Johnny Thunder (which doesn't have them)
source_id = 11124 # Giant Lego Dacta Basic Set (has red bricks)
target_id = 14686 # Lego Mosaic Johnny Thunder (doesn't have red bricks)
part_num = '3001'
color_id = 4
transfer_qty = 50
reason = 'part_redistribution'
target_initial = get_inventory_part_quantity(conn, target_id, part_num, color_id)
if target_initial != 0:
print(f"❌ FAIL: Pre-condition failed. Target already has {target_initial} of this part, expected 0")
return False
source_initial = get_inventory_part_quantity(conn, source_id, part_num, color_id)
print(f"Initial quantities - Source: {source_initial}, Target: {target_initial}")
with conn.cursor() as cur:
cur.execute(
"SELECT transfer_parts(%s, %s, %s, %s, %s, %s)",
(source_id, target_id, part_num, color_id, transfer_qty, reason)
)
result = cur.fetchone()
print(f"Transfer result: {result[0]}")
source_final = get_inventory_part_quantity(conn, source_id, part_num, color_id)
target_final = get_inventory_part_quantity(conn, target_id, part_num, color_id)
print(f"Final quantities - Source: {source_final}, Target: {target_final}")
expected_source = source_initial - transfer_qty
expected_target = transfer_qty
if source_final != expected_source:
print(f"❌ FAIL: Source quantity mismatch. Expected {expected_source}, got {source_final}")
elif target_final != expected_target:
print(f"❌ FAIL: Target quantity mismatch. Expected {expected_target}, got {target_final}")
else:
print("✅ PASS: New part transfer completed correctly")
passed = True
except psycopg2.Error as e:
print(f"❌ FAIL: Transfer failed unexpectedly with error: {e}")
finally:
conn.rollback()
return passed
def verify_business_rule_validation(conn) -> bool:
"""Test business rule validation including quantity limits and self-transfer prevention."""
print("\n-- Verifying Business Rule Validation --")
# Test 1: Self-transfer (should fail)
print("Test 1: Self-transfer (should fail)")
test1_passed = False
try:
source_id = 14469
part_num = '3024'
color_id = 15
transfer_qty = 10
reason = 'self_transfer'
with conn.cursor() as cur:
cur.execute(
"SELECT transfer_parts(%s, %s, %s, %s, %s, %s)",
(source_id, source_id, part_num, color_id, transfer_qty, reason)
)
result = cur.fetchone()
print(f"❌ FAIL: Self-transfer should have failed but succeeded: {result[0]}")
except psycopg2.Error:
print(f"✅ PASS: Self-transfer correctly failed")
test1_passed = True
except Exception as e:
print(f"❌ FAIL: Self-transfer test failed with unexpected error: {e}")
finally:
conn.rollback() # Rollback after first test
# Test 2: Transfer quantity exceeds maximum (should fail)
print("Test 2: Transfer quantity exceeds maximum (should fail)")
test2_passed = False
try:
source_id = 14469
target_id = 14686
part_num = '3024'
color_id = 15
with conn.cursor() as cur:
cur.execute(
"SELECT transfer_parts(%s, %s, %s, %s, %s, %s)",
(source_id, target_id, part_num, color_id, 600, 'large_transfer')
)
result = cur.fetchone()
print(f"❌ FAIL: Large transfer should have failed but succeeded: {result[0]}")
except psycopg2.Error:
print(f"✅ PASS: Large transfer correctly failed")
test2_passed = True
except Exception as e:
print(f"❌ FAIL: Large transfer test failed with unexpected error: {e}")
finally:
conn.rollback() # Rollback after second test
# Test 3: Transfer quantity below minimum (should fail)
print("Test 3: Transfer quantity below minimum (should fail)")
test3_passed = False
try:
source_id = 14469
target_id = 14686
part_num = '3024'
color_id = 15
with conn.cursor() as cur:
cur.execute(
"SELECT transfer_parts(%s, %s, %s, %s, %s, %s)",
(source_id, target_id, part_num, color_id, 0, 'zero_transfer')
)
result = cur.fetchone()
print(f"❌ FAIL: Zero transfer should have failed but succeeded: {result[0]}")
except psycopg2.Error:
print(f"✅ PASS: Zero transfer correctly failed")
test3_passed = True
except Exception as e:
print(f"❌ FAIL: Zero transfer test failed with unexpected error: {e}")
finally:
conn.rollback() # Rollback after third test
return test1_passed and test2_passed and test3_passed
def verify_insufficient_quantity_error(conn) -> bool:
"""Test that transfer fails when source has insufficient quantity."""
print("\n-- Verifying Insufficient Quantity Error --")
passed = False
try:
source_id = 14469
target_id = 14686
part_num = '3024'
color_id = 15
transfer_qty = 99999 # Far more than available
reason = 'insufficient_test'
source_initial = get_inventory_part_quantity(conn, source_id, part_num, color_id)
target_initial = get_inventory_part_quantity(conn, target_id, part_num, color_id)
print(f"Initial quantities - Source: {source_initial}, Target: {target_initial}")
with conn.cursor() as cur:
try:
cur.execute(
"SELECT transfer_parts(%s, %s, %s, %s, %s, %s)",
(source_id, target_id, part_num, color_id, transfer_qty, reason)
)
result = cur.fetchone()
print(f"❌ FAIL: Transfer should have failed but succeeded: {result[0]}")
except psycopg2.Error as e:
print(f"✅ PASS: Transfer correctly failed with an exception.")
# After an exception, the transaction is in an aborted state. Must rollback before new queries.
conn.rollback()
source_final = get_inventory_part_quantity(conn, source_id, part_num, color_id)
target_final = get_inventory_part_quantity(conn, target_id, part_num, color_id)
if source_final != source_initial:
print(f"❌ FAIL: Source quantity changed from {source_initial} to {source_final}")
elif target_final != target_initial:
print(f"❌ FAIL: Target quantity changed from {target_initial} to {target_final}")
else:
print("✅ PASS: Database state unchanged after failed transfer")
passed = True
finally:
conn.rollback()
return passed
def verify_invalid_inventory_error(conn) -> bool:
"""Test that transfer fails with invalid inventory IDs."""
print("\n-- Verifying Invalid Inventory Error --")
passed = False
try:
source_id = 99999 # Non-existent inventory
target_id = 14686
part_num = '3024'
color_id = 15
transfer_qty = 10
reason = 'invalid_test'
target_initial = get_inventory_part_quantity(conn, target_id, part_num, color_id)
with conn.cursor() as cur:
try:
cur.execute(
"SELECT transfer_parts(%s, %s, %s, %s, %s, %s)",
(source_id, target_id, part_num, color_id, transfer_qty, reason)
)
result = cur.fetchone()
print(f"❌ FAIL: Transfer should have failed but succeeded: {result[0]}")
except psycopg2.Error as e:
print(f"✅ PASS: Transfer correctly failed with an exception.")
# Rollback the aborted transaction
conn.rollback()
target_final = get_inventory_part_quantity(conn, target_id, part_num, color_id)
if target_final != target_initial:
print(f"❌ FAIL: Target quantity changed from {target_initial} to {target_final}")
else:
print("✅ PASS: Database state unchanged after invalid inventory error")
passed = True
finally:
conn.rollback()
return passed
def verify_audit_logging(conn) -> bool:
"""
Test that audit logging captures both successful and failed transfers.
This function uses commits to separate test cases and work around the
transactional paradox of logging a failure within a transaction that
is about to be rolled back by the client.
"""
print("\n-- Verifying Audit Logging --")
# Part 1: Test success logging
print("Part 1: Verifying success log entry...")
success_passed = False
try:
with conn.cursor() as cur:
cur.execute("SELECT COUNT(*) FROM inventory_transfer_log")
initial_count = cur.fetchone()[0]
with conn.cursor() as cur:
cur.execute(
"SELECT transfer_parts(14469, 14686, '3024', 15, 5, 'audit_test_success')"
)
# Check the log before committing/rolling back
with conn.cursor() as cur:
cur.execute("SELECT COUNT(*) FROM inventory_transfer_log")
final_count = cur.fetchone()[0]
if final_count == initial_count + 1:
print("✅ PASS: Success log was correctly written within the transaction.")
success_passed = True
else:
print("❌ FAIL: Success log was not created.")
except Exception as e:
print(f"❌ FAIL: Success logging test threw an unexpected error: {e}")
finally:
conn.rollback() # Clean up the transaction for the next part
if not success_passed:
return False
# Part 2: Test failure logging
print("\nPart 2: Verifying failure log entry...")
failure_passed = False
try:
with conn.cursor() as cur:
cur.execute("SELECT COUNT(*) FROM inventory_transfer_log")
initial_count = cur.fetchone()[0]
try:
with conn.cursor() as cur:
cur.execute(
"SELECT transfer_parts(14469, 14469, '3024', 15, 5, 'audit_test_fail')"
)
except psycopg2.Error:
# This is the expected failure path.
# The function should have logged the failure before raising the error.
# Now, we check the log table.
pass
# The transaction is now in an aborted state. We must rollback to issue new commands.
conn.rollback()
with conn.cursor() as cur:
cur.execute("SELECT COUNT(*) FROM inventory_transfer_log")
final_count = cur.fetchone()[0]
if final_count == initial_count:
print("✅ PASS: Failure log was correctly rolled back as expected in a standard transaction.")
failure_passed = True
else:
print("❌ FAIL: Failure log was not rolled back. This implies a non-standard transaction behavior.")
print(f"Log count before: {initial_count}, Log count after: {final_count}")
except Exception as e:
print(f"❌ FAIL: Failure logging test threw an unexpected error: {e}")
finally:
conn.rollback() # Ensure cleanup
return success_passed and failure_passed
def verify_exact_quantity_transfer(conn) -> bool:
"""Test transferring exact quantity (should delete source row when quantity becomes 0)."""
print("\n-- Verifying Exact Quantity Transfer --")
passed = False
target_id = 14686 # Use a fixed target inventory
try:
# Find a part with a small quantity that doesn't conflict with the target inventory
with conn.cursor() as cur:
cur.execute(
"""
SELECT inventory_id, part_num, color_id, quantity
FROM public.lego_inventory_parts
WHERE quantity BETWEEN 5 AND 20 AND inventory_id != %s
LIMIT 1
""",
(target_id,)
)
result = cur.fetchone()
if not result:
print("⚠️ SKIP: No suitable part found for exact quantity test")
return True
source_id, part_num, color_id, exact_qty = result
print(f"Testing exact transfer: {exact_qty} parts of '{part_num}' from inventory {source_id} to {target_id}")
source_initial = get_inventory_part_quantity(conn, source_id, part_num, color_id)
target_initial = get_inventory_part_quantity(conn, target_id, part_num, color_id)
print(f"Initial quantities - Source: {source_initial}, Target: {target_initial}")
with conn.cursor() as cur:
cur.execute(
"SELECT transfer_parts(%s, %s, %s, %s, %s, %s)",
(source_id, target_id, part_num, color_id, exact_qty, 'exact_transfer')
)
print(f"Transfer result: {cur.fetchone()[0]}")
source_final = get_inventory_part_quantity(conn, source_id, part_num, color_id)
target_final = get_inventory_part_quantity(conn, target_id, part_num, color_id)
print(f"Final quantities - Source: {source_final}, Target: {target_final}")
expected_source = 0
expected_target = target_initial + exact_qty
if source_final != expected_source:
print(f"❌ FAIL: Source quantity should be 0 (row deleted), but got {source_final}")
elif target_final != expected_target:
print(f"❌ FAIL: Target quantity mismatch. Expected {expected_target}, got {target_final}")
else:
print("✅ PASS: Exact quantity transfer completed correctly (source row deleted)")
passed = True
except psycopg2.Error as e:
print(f"❌ FAIL: Transfer failed unexpectedly with error: {e}")
finally:
conn.rollback()
return passed
def main():
"""Main verification function."""
print("=" * 60)
print("LEGO Enhanced Inventory Transfer Function Verification Script")
print("=" * 60)
conn_params = get_connection_params()
if not conn_params.get("database"):
print("❌ CRITICAL: POSTGRES_DATABASE environment variable not set.")
sys.exit(1)
conn = None
try:
conn = psycopg2.connect(**conn_params)
conn.autocommit = False # Ensure we can control transactions manually
# Run all verification steps
results = [
verify_system_components(conn),
verify_successful_transfer_with_audit(conn),
verify_new_part_transfer(conn),
verify_business_rule_validation(conn),
verify_insufficient_quantity_error(conn),
verify_invalid_inventory_error(conn),
verify_audit_logging(conn),
verify_exact_quantity_transfer(conn),
]
if all(results):
print("\n🎉 Overall Result: PASS - All verification steps completed successfully!")
sys.exit(0)
else:
print("\n❌ Overall Result: FAIL - One or more verification steps failed.")
sys.exit(1)
except psycopg2.OperationalError as e:
print(f"❌ CRITICAL: Could not connect to the database. Details: {e}")
sys.exit(1)
except Exception as e:
print(f"❌ CRITICAL: An unexpected error occurred. Details: {e}")
sys.exit(1)
finally:
if conn:
conn.close()
if __name__ == "__main__":
main()
================================================
FILE: tasks/postgres/standard/security/rls_business_access/description.md
================================================
Implement Row Level Security (RLS) policies for a social media platform with Users, Posts, Comments, and Channels.
## Your Mission:
Build RLS policies for a social platform where users create posts and comments in channels. Implement proper access control so users can manage their own content, while channel moderators can moderate content in their channels.
## RLS Requirements:
### 1. Users Table Access Rules:
- **SELECT**: Users can read all public user profiles (username, created_at)
- **UPDATE**: Users can only modify their own profile
- **DELETE**: Users can only delete their own account
### 2. Channels Table Access Rules:
- **SELECT**: Everyone can read public channel information
- **INSERT**: Any authenticated user can create a channel (becomes owner)
- **UPDATE**: Only channel owners can modify channel details
- **DELETE**: Only channel owners can delete channels
### 3. Posts Table Access Rules:
- **SELECT**: Users can read all posts in channels they have access to
- **INSERT**: Authenticated users can create posts in any channel
- **UPDATE**: Post authors OR channel moderators OR channel owners can edit posts
- **DELETE**: Post authors OR channel moderators OR channel owners can delete posts
### 4. Comments Table Access Rules:
- **SELECT**: Users can read comments on posts they can access
- **INSERT**: Authenticated users can comment on posts they can see
- **UPDATE**: Comment authors OR post authors OR channel moderators OR channel owners can edit comments
- **DELETE**: Comment authors OR post authors OR channel moderators OR channel owners can delete comments
### 5. Channel Moderators Table Access Rules:
- **SELECT**: Users can see moderator lists for channels
- **INSERT**: Only channel owners can add moderators
- **DELETE**: Channel owners can remove moderators; moderators can remove themselves
## Session Context:
Use `current_setting('app.current_user_id')` to get the current user ID from session context.
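For illustration only, a session-scoped setting can be written with `SET` and read back with `current_setting`; the UUID below is just an example value, and the `NULLIF` guard turns an unset or empty value into `NULL`:
```sql
-- Illustrative only: set the acting user for this session ...
SET app.current_user_id = '11111111-1111-1111-1111-111111111111';

-- ... and read it back as a uuid wherever a policy or helper needs it.
SELECT NULLIF(current_setting('app.current_user_id', true), '')::uuid AS acting_user;
```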
## Schema Requirements:
- **Use only the `public` schema** for all tables, functions, and policies
- All helper functions should be created in the `public` schema
- Do not create additional schemas
## Expected Deliverables:
1. **Enable RLS** on all five tables
2. **Create policies** for SELECT, INSERT, UPDATE, DELETE operations on each table
3. **Helper functions** to check permissions efficiently (see the sketch after this list):
- `is_channel_owner(channel_id, user_id)`
- `is_channel_moderator(channel_id, user_id)`
- `can_moderate_channel(channel_id, user_id)`
4. **Proper indexing** to ensure RLS policies perform well
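As a minimal sketch (the function body and policy name are illustrative, not the required implementation), one helper plus a policy that calls it might look like:
```sql
-- Sketch: owner check as a helper. SECURITY DEFINER lets the lookup bypass RLS
-- on channels, so evaluating the policy does not recurse into the same policy.
CREATE OR REPLACE FUNCTION public.is_channel_owner(p_channel_id UUID, p_user_id UUID)
RETURNS BOOLEAN AS $$
  SELECT EXISTS (
    SELECT 1 FROM public.channels
    WHERE id = p_channel_id AND owner_id = p_user_id
  );
$$ LANGUAGE sql STABLE SECURITY DEFINER;

-- Sketch: a policy that delegates its check to the helper.
-- A matching WITH CHECK clause would normally mirror the USING expression.
CREATE POLICY channels_update_owner ON public.channels
  FOR UPDATE
  USING (public.is_channel_owner(id, NULLIF(current_setting('app.current_user_id', true), '')::UUID));
```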
## Test Scenarios:
Your RLS implementation will be verified with:
- **Content ownership**: Users can only edit their own posts/comments
- **Moderation hierarchy**: Moderators can moderate content in their channels
- **Channel isolation**: Users only see content from accessible channels
- **Permission escalation**: Owners have full control over their channels
- **Cross-table access**: Comment policies respect post and channel permissions
## Success Criteria:
- Users can manage their own content (posts, comments)
- Channel owners have full control over their channels
- Moderators can moderate content in their assigned channels
- No unauthorized access to other users' private data
- Policies are efficient and don't create performance bottlenecks
- All operations (SELECT, INSERT, UPDATE, DELETE) are properly secured
================================================
FILE: tasks/postgres/standard/security/rls_business_access/ground_truth.sql
================================================
-- Ground Truth RLS Implementation
BEGIN;
-- ============================================================================
-- PERFORMANCE INDEXES FOR RLS
-- ============================================================================
-- Users table indexes
CREATE INDEX IF NOT EXISTS idx_users_is_public ON users(is_public);
-- Channels table indexes
CREATE INDEX IF NOT EXISTS idx_channels_owner_id ON channels(owner_id);
CREATE INDEX IF NOT EXISTS idx_channels_is_public ON channels(is_public);
-- Channel moderators table indexes
CREATE INDEX IF NOT EXISTS idx_channel_moderators_channel_user ON channel_moderators(channel_id, user_id);
CREATE INDEX IF NOT EXISTS idx_channel_moderators_user ON channel_moderators(user_id);
-- Posts table indexes
CREATE INDEX IF NOT EXISTS idx_posts_channel_id ON posts(channel_id);
CREATE INDEX IF NOT EXISTS idx_posts_author_id ON posts(author_id);
CREATE INDEX IF NOT EXISTS idx_posts_created_at ON posts(created_at);
-- Comments table indexes
CREATE INDEX IF NOT EXISTS idx_comments_post_id ON comments(post_id);
CREATE INDEX IF NOT EXISTS idx_comments_author_id ON comments(author_id);
CREATE INDEX IF NOT EXISTS idx_comments_created_at ON comments(created_at);
-- ============================================================================
-- ENABLE ROW LEVEL SECURITY
-- ============================================================================
ALTER TABLE users ENABLE ROW LEVEL SECURITY;
ALTER TABLE channels ENABLE ROW LEVEL SECURITY;
ALTER TABLE channel_moderators ENABLE ROW LEVEL SECURITY;
ALTER TABLE posts ENABLE ROW LEVEL SECURITY;
ALTER TABLE comments ENABLE ROW LEVEL SECURITY;
-- ============================================================================
-- USERS TABLE POLICIES
-- ============================================================================
-- Users SELECT: Can read public profiles OR own profile
DROP POLICY IF EXISTS users_select ON users;
CREATE POLICY users_select ON users
FOR SELECT
USING (
is_public = true
OR id = app_current_user_id()
);
-- Users UPDATE: Can only update own profile
DROP POLICY IF EXISTS users_update ON users;
CREATE POLICY users_update ON users
FOR UPDATE
USING (id = app_current_user_id())
WITH CHECK (id = app_current_user_id());
-- Users DELETE: Can only delete own account
DROP POLICY IF EXISTS users_delete ON users;
CREATE POLICY users_delete ON users
FOR DELETE
USING (id = app_current_user_id());
-- ============================================================================
-- CHANNELS TABLE POLICIES
-- ============================================================================
-- Channels SELECT: Can read public channels OR channels where user is owner/moderator
DROP POLICY IF EXISTS channels_select ON channels;
CREATE POLICY channels_select ON channels
FOR SELECT
USING (
is_public = true
OR owner_id = app_current_user_id()
OR is_channel_moderator(id, app_current_user_id())
);
-- Channels INSERT: Authenticated users can create channels (become owner)
DROP POLICY IF EXISTS channels_insert ON channels;
CREATE POLICY channels_insert ON channels
FOR INSERT
WITH CHECK (owner_id = app_current_user_id());
-- Channels UPDATE: Only channel owners can modify
DROP POLICY IF EXISTS channels_update ON channels;
CREATE POLICY channels_update ON channels
FOR UPDATE
USING (owner_id = app_current_user_id())
WITH CHECK (owner_id = app_current_user_id());
-- Channels DELETE: Only channel owners can delete
DROP POLICY IF EXISTS channels_delete ON channels;
CREATE POLICY channels_delete ON channels
FOR DELETE
USING (owner_id = app_current_user_id());
-- ============================================================================
-- POSTS TABLE POLICIES
-- ============================================================================
-- Posts SELECT: Can read posts in accessible channels
DROP POLICY IF EXISTS posts_select ON posts;
CREATE POLICY posts_select ON posts
FOR SELECT
USING (
EXISTS (
SELECT 1 FROM channels c
WHERE c.id = posts.channel_id
AND (
c.is_public = true
OR c.owner_id = app_current_user_id()
OR is_channel_moderator(c.id, app_current_user_id())
)
)
);
-- Posts INSERT: Authenticated users can create posts (must be author)
DROP POLICY IF EXISTS posts_insert ON posts;
CREATE POLICY posts_insert ON posts
FOR INSERT
WITH CHECK (
author_id = app_current_user_id()
AND EXISTS (
SELECT 1 FROM channels c
WHERE c.id = posts.channel_id
AND (
c.is_public = true
OR c.owner_id = app_current_user_id()
OR is_channel_moderator(c.id, app_current_user_id())
)
)
);
-- Posts UPDATE: Post authors OR channel moderators/owners can edit
DROP POLICY IF EXISTS posts_update ON posts;
CREATE POLICY posts_update ON posts
FOR UPDATE
USING (
author_id = app_current_user_id()
OR can_moderate_channel(channel_id, app_current_user_id())
)
WITH CHECK (
author_id = app_current_user_id()
OR can_moderate_channel(channel_id, app_current_user_id())
);
-- Posts DELETE: Post authors OR channel moderators/owners can delete
DROP POLICY IF EXISTS posts_delete ON posts;
CREATE POLICY posts_delete ON posts
FOR DELETE
USING (
author_id = app_current_user_id()
OR can_moderate_channel(channel_id, app_current_user_id())
);
-- ============================================================================
-- COMMENTS TABLE POLICIES
-- ============================================================================
-- Comments SELECT: Can read comments on accessible posts
DROP POLICY IF EXISTS comments_select ON comments;
CREATE POLICY comments_select ON comments
FOR SELECT
USING (
EXISTS (
SELECT 1 FROM posts p
JOIN channels c ON c.id = p.channel_id
WHERE p.id = comments.post_id
AND (
c.is_public = true
OR c.owner_id = app_current_user_id()
OR is_channel_moderator(c.id, app_current_user_id())
)
)
);
-- Comments INSERT: Authenticated users can comment on accessible posts
DROP POLICY IF EXISTS comments_insert ON comments;
CREATE POLICY comments_insert ON comments
FOR INSERT
WITH CHECK (
author_id = app_current_user_id()
AND EXISTS (
SELECT 1 FROM posts p
JOIN channels c ON c.id = p.channel_id
WHERE p.id = comments.post_id
AND (
c.is_public = true
OR c.owner_id = app_current_user_id()
OR is_channel_moderator(c.id, app_current_user_id())
)
)
);
-- Comments UPDATE: Comment authors OR post authors OR channel moderators/owners can edit
DROP POLICY IF EXISTS comments_update ON comments;
CREATE POLICY comments_update ON comments
FOR UPDATE
USING (
author_id = app_current_user_id()
OR EXISTS (
SELECT 1 FROM posts p
WHERE p.id = comments.post_id
AND (
p.author_id = app_current_user_id()
OR can_moderate_channel(p.channel_id, app_current_user_id())
)
)
)
WITH CHECK (
author_id = app_current_user_id()
OR EXISTS (
SELECT 1 FROM posts p
WHERE p.id = comments.post_id
AND (
p.author_id = app_current_user_id()
OR can_moderate_channel(p.channel_id, app_current_user_id())
)
)
);
-- Comments DELETE: Comment authors OR post authors OR channel moderators/owners can delete
DROP POLICY IF EXISTS comments_delete ON comments;
CREATE POLICY comments_delete ON comments
FOR DELETE
USING (
author_id = app_current_user_id()
OR EXISTS (
SELECT 1 FROM posts p
WHERE p.id = comments.post_id
AND (
p.author_id = app_current_user_id()
OR can_moderate_channel(p.channel_id, app_current_user_id())
)
)
);
-- ============================================================================
-- CHANNEL MODERATORS TABLE POLICIES
-- ============================================================================
-- Channel moderators SELECT: Visible to users who can access the channel
DROP POLICY IF EXISTS channel_moderators_select ON channel_moderators;
CREATE POLICY channel_moderators_select ON channel_moderators
FOR SELECT
USING (
EXISTS (
SELECT 1 FROM channels c
WHERE c.id = channel_moderators.channel_id
AND (
c.is_public = true
OR c.owner_id = app_current_user_id()
OR is_channel_moderator(c.id, app_current_user_id())
)
)
);
-- Channel moderators INSERT: Only channel owners can add moderators
DROP POLICY IF EXISTS channel_moderators_insert ON channel_moderators;
CREATE POLICY channel_moderators_insert ON channel_moderators
FOR INSERT
WITH CHECK (is_channel_owner(channel_id, app_current_user_id()));
-- Channel moderators DELETE: Channel owners can remove any; moderators can remove themselves
DROP POLICY IF EXISTS channel_moderators_delete ON channel_moderators;
CREATE POLICY channel_moderators_delete ON channel_moderators
FOR DELETE
USING (
is_channel_owner(channel_id, app_current_user_id())
OR user_id = app_current_user_id()
);
-- ============================================================================
-- USAGE NOTES
-- ============================================================================
/*
Usage Instructions:
1. Set session context before queries (substitute the acting user's UUID):
   SET app.current_user_id = '<acting-user-uuid>';
2. For anonymous users:
SET app.current_user_id = '';
3. Test examples:
-- Alice (owner of general channel)
SET app.current_user_id = '11111111-1111-1111-1111-111111111111';
-- Bob (moderator of general channel)
SET app.current_user_id = '22222222-2222-2222-2222-222222222222';
*/
COMMIT;
================================================
FILE: tasks/postgres/standard/security/rls_business_access/meta.json
================================================
{
"task_id": "rls_business_access",
"task_name": "RLS Business Access",
"category_id": "security",
"category_name": "Security",
"description": "Implement Row Level Security policies for social platform with proper access control for posts, comments, and channels.",
"author": "Fanshi Zhang",
"created_at": "2025-08-17",
"difficulty": "L3",
"tags": [
"security and access control",
"stored procedures and functions",
"schema design"
],
"mcp": [
"postgres"
],
"meta_data": {
"stateType": "text",
"stateContent": "Table \"users\" {\n \"id\" uuid [pk, not null, default: `gen_random_uuid()`]\n \"username\" varchar(50) [unique, not null]\n \"email\" varchar(100) [unique, not null]\n \"is_public\" bool [default: false]\n \"created_at\" timestamp [default: `CURRENT_TIMESTAMP`]\n\n Indexes {\n is_public [type: btree, name: \"idx_users_is_public\"]\n }\n}\n\nTable \"channels\" {\n \"id\" uuid [pk, not null, default: `gen_random_uuid()`]\n \"name\" varchar(100) [not null]\n \"description\" text\n \"is_public\" bool [default: true]\n \"owner_id\" uuid\n \"created_at\" timestamp [default: `CURRENT_TIMESTAMP`]\n\n Indexes {\n is_public [type: btree, name: \"idx_channels_is_public\"]\n owner_id [type: btree, name: \"idx_channels_owner_id\"]\n }\n}\n\nTable \"channel_moderators\" {\n \"channel_id\" uuid [not null]\n \"user_id\" uuid [not null]\n \"created_at\" timestamp [default: `CURRENT_TIMESTAMP`]\n\n Indexes {\n (channel_id, user_id) [type: btree, name: \"channel_moderators_pkey\"]\n (channel_id, user_id) [type: btree, name: \"idx_channel_moderators_channel_user\"]\n user_id [type: btree, name: \"idx_channel_moderators_user\"]\n }\n}\n\nTable \"posts\" {\n \"id\" uuid [pk, not null, default: `gen_random_uuid()`]\n \"channel_id\" uuid\n \"author_id\" uuid\n \"title\" varchar(200) [not null]\n \"content\" text\n \"created_at\" timestamp [default: `CURRENT_TIMESTAMP`]\n \"updated_at\" timestamp [default: `CURRENT_TIMESTAMP`]\n\n Indexes {\n author_id [type: btree, name: \"idx_posts_author_id\"]\n channel_id [type: btree, name: \"idx_posts_channel_id\"]\n created_at [type: btree, name: \"idx_posts_created_at\"]\n }\n}\n\nTable \"comments\" {\n \"id\" uuid [pk, not null, default: `gen_random_uuid()`]\n \"post_id\" uuid\n \"author_id\" uuid\n \"content\" text [not null]\n \"created_at\" timestamp [default: `CURRENT_TIMESTAMP`]\n \"updated_at\" timestamp [default: `CURRENT_TIMESTAMP`]\n\n Indexes {\n author_id [type: btree, name: \"idx_comments_author_id\"]\n created_at [type: btree, name: \"idx_comments_created_at\"]\n post_id [type: btree, name: \"idx_comments_post_id\"]\n }\n}\n\nRef \"channel_moderators_channel_id_fkey\":\"channels\".\"id\" < \"channel_moderators\".\"channel_id\" [delete: cascade]\n\nRef \"channel_moderators_user_id_fkey\":\"users\".\"id\" < \"channel_moderators\".\"user_id\" [delete: cascade]\n\nRef \"channels_owner_id_fkey\":\"users\".\"id\" < \"channels\".\"owner_id\" [delete: cascade]\n\nRef \"comments_author_id_fkey\":\"users\".\"id\" < \"comments\".\"author_id\" [delete: cascade]\n\nRef \"comments_post_id_fkey\":\"posts\".\"id\" < \"comments\".\"post_id\" [delete: cascade]\n\nRef \"posts_author_id_fkey\":\"users\".\"id\" < \"posts\".\"author_id\" [delete: cascade]\n\nRef \"posts_channel_id_fkey\":\"channels\".\"id\" < \"posts\".\"channel_id\" [delete: cascade]\n",
"stateUrl": null,
"stateOriginalUrl": null
}
}
================================================
FILE: tasks/postgres/standard/security/rls_business_access/prepare_environment.py
================================================
#!/usr/bin/env python3
import os
import psycopg2
from psycopg2.extensions import ISOLATION_LEVEL_AUTOCOMMIT
import sys
def setup_rls_environment():
"""
Set up a PostgreSQL environment for a social media platform with RLS policies.
Creates Users, Channels, Posts, Comments, and Channel Moderators for testing RLS implementations.
"""
# Database connection parameters from environment
db_params = {
'host': os.getenv('POSTGRES_HOST', 'localhost'),
'port': os.getenv('POSTGRES_PORT', '5432'),
'user': os.getenv('POSTGRES_USERNAME', 'postgres'),
'password': os.getenv('POSTGRES_PASSWORD', 'password'),
'database': os.getenv('POSTGRES_DATABASE', 'postgres')
}
try:
conn = psycopg2.connect(**db_params)
conn.set_isolation_level(ISOLATION_LEVEL_AUTOCOMMIT)
cur = conn.cursor()
# 1. Users Table (with correct field name for verification)
cur.execute("""
CREATE TABLE IF NOT EXISTS users (
id UUID PRIMARY KEY DEFAULT gen_random_uuid(),
username VARCHAR(50) UNIQUE NOT NULL,
email VARCHAR(100) UNIQUE NOT NULL,
is_public BOOLEAN DEFAULT false,
created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP
);
""")
print("✓ Created users table")
# 2. Channels Table
cur.execute("""
CREATE TABLE IF NOT EXISTS channels (
id UUID PRIMARY KEY DEFAULT gen_random_uuid(),
name VARCHAR(100) NOT NULL,
description TEXT,
is_public BOOLEAN DEFAULT true,
owner_id UUID REFERENCES users(id) ON DELETE CASCADE,
created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP
);
""")
print("✓ Created channels table")
# 3. Channel Moderators Table
cur.execute("""
CREATE TABLE IF NOT EXISTS channel_moderators (
channel_id UUID REFERENCES channels(id) ON DELETE CASCADE,
user_id UUID REFERENCES users(id) ON DELETE CASCADE,
created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
PRIMARY KEY (channel_id, user_id)
);
""")
print("✓ Created channel_moderators table")
# 4. Posts Table
cur.execute("""
CREATE TABLE IF NOT EXISTS posts (
id UUID PRIMARY KEY DEFAULT gen_random_uuid(),
channel_id UUID REFERENCES channels(id) ON DELETE CASCADE,
author_id UUID REFERENCES users(id) ON DELETE CASCADE,
title VARCHAR(200) NOT NULL,
content TEXT,
created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
updated_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP
);
""")
print("✓ Created posts table")
# 5. Comments Table
cur.execute("""
CREATE TABLE IF NOT EXISTS comments (
id UUID PRIMARY KEY DEFAULT gen_random_uuid(),
post_id UUID REFERENCES posts(id) ON DELETE CASCADE,
author_id UUID REFERENCES users(id) ON DELETE CASCADE,
content TEXT NOT NULL,
created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
updated_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP
);
""")
print("✓ Created comments table")
# Create helper functions for RLS (matching ground truth expectations)
cur.execute("""
-- Function to get current user ID from session context
CREATE OR REPLACE FUNCTION app_current_user_id()
RETURNS UUID AS $$
BEGIN
RETURN NULLIF(current_setting('app.current_user_id', true), '')::UUID;
END;
$$ LANGUAGE plpgsql SECURITY DEFINER STABLE PARALLEL SAFE;
-- Function to check if user owns a channel
CREATE OR REPLACE FUNCTION is_channel_owner(p_channel_id UUID, p_user_id UUID)
RETURNS BOOLEAN AS $$
BEGIN
RETURN EXISTS (
SELECT 1 FROM channels
WHERE id = p_channel_id AND owner_id = p_user_id
);
END;
$$ LANGUAGE plpgsql SECURITY DEFINER STABLE PARALLEL SAFE;
-- Function to check if user moderates a channel
CREATE OR REPLACE FUNCTION is_channel_moderator(p_channel_id UUID, p_user_id UUID)
RETURNS BOOLEAN AS $$
BEGIN
RETURN EXISTS (
SELECT 1 FROM channel_moderators
WHERE channel_id = p_channel_id AND user_id = p_user_id
);
END;
$$ LANGUAGE plpgsql SECURITY DEFINER STABLE PARALLEL SAFE;
-- Function to check if user can moderate channel (owner OR moderator)
CREATE OR REPLACE FUNCTION can_moderate_channel(p_channel_id UUID, p_user_id UUID)
RETURNS BOOLEAN AS $$
BEGIN
RETURN is_channel_owner(p_channel_id, p_user_id)
OR is_channel_moderator(p_channel_id, p_user_id);
END;
$$ LANGUAGE plpgsql SECURITY DEFINER STABLE PARALLEL SAFE;
""")
print("✓ Created RLS helper functions")
# Insert sample data
print("\nInserting sample data...")
# Sample users (exact UUIDs expected by verification script)
cur.execute("""
INSERT INTO users (id, username, email, is_public) VALUES
('11111111-1111-1111-1111-111111111111', 'alice', 'alice@example.com', true),
('22222222-2222-2222-2222-222222222222', 'bob', 'bob@example.com', true),
('33333333-3333-3333-3333-333333333333', 'charlie', 'charlie@example.com', false),
('44444444-4444-4444-4444-444444444444', 'diana', 'diana@example.com', true),
('55555555-5555-5555-5555-555555555555', 'eve', 'eve@example.com', false)
ON CONFLICT (id) DO NOTHING;
""")
print("✓ Created 5 sample users")
# Sample channels (exact UUIDs expected by verification script)
cur.execute("""
INSERT INTO channels (id, name, description, is_public, owner_id) VALUES
('aaaaaaaa-aaaa-aaaa-aaaa-aaaaaaaaaaaa', 'general', 'General discussion channel', true, '11111111-1111-1111-1111-111111111111'),
('bbbbbbbb-bbbb-bbbb-bbbb-bbbbbbbbbbbb', 'tech-talk', 'Technical discussions', true, '22222222-2222-2222-2222-222222222222'),
('cccccccc-cccc-cccc-cccc-cccccccccccc', 'random', 'Random conversations', false, '33333333-3333-3333-3333-333333333333')
ON CONFLICT (id) DO NOTHING;
""")
print("✓ Created 3 sample channels")
# Sample moderators (exact relationships expected by verification script)
cur.execute("""
INSERT INTO channel_moderators (channel_id, user_id) VALUES
('aaaaaaaa-aaaa-aaaa-aaaa-aaaaaaaaaaaa', '22222222-2222-2222-2222-222222222222'), -- Bob moderates general
('bbbbbbbb-bbbb-bbbb-bbbb-bbbbbbbbbbbb', '44444444-4444-4444-4444-444444444444') -- Diana moderates tech-talk
ON CONFLICT (channel_id, user_id) DO NOTHING;
""")
print("✓ Created sample moderator assignments")
# Sample posts (exact UUIDs expected by verification script)
cur.execute("""
INSERT INTO posts (id, channel_id, author_id, title, content) VALUES
('dddddddd-dddd-dddd-dddd-dddddddddddd', 'aaaaaaaa-aaaa-aaaa-aaaa-aaaaaaaaaaaa', '11111111-1111-1111-1111-111111111111', 'Welcome to the platform!', 'This is our first post'),
('eeeeeeee-eeee-eeee-eeee-eeeeeeeeeeee', 'aaaaaaaa-aaaa-aaaa-aaaa-aaaaaaaaaaaa', '33333333-3333-3333-3333-333333333333', 'Hello everyone', 'Nice to meet you all'),
('ffffffff-ffff-ffff-ffff-ffffffffffff', 'bbbbbbbb-bbbb-bbbb-bbbb-bbbbbbbbbbbb', '22222222-2222-2222-2222-222222222222', 'PostgreSQL RLS Tutorial', 'Let''s discuss Row Level Security'),
('10101010-1010-1010-1010-101010101010', 'cccccccc-cccc-cccc-cccc-cccccccccccc', '55555555-5555-5555-5555-555555555555', 'Random thoughts', 'Just some random content here')
ON CONFLICT (id) DO NOTHING;
""")
print("✓ Created 4 sample posts")
# Sample comments (exact UUIDs expected by verification script)
cur.execute("""
INSERT INTO comments (id, post_id, author_id, content) VALUES
('99999999-9999-9999-9999-999999999999', 'dddddddd-dddd-dddd-dddd-dddddddddddd', '22222222-2222-2222-2222-222222222222', 'Great to have you here!'),
('88888888-8888-8888-8888-888888888888', 'dddddddd-dddd-dddd-dddd-dddddddddddd', '33333333-3333-3333-3333-333333333333', 'Thanks for setting this up'),
('77777777-7777-7777-7777-777777777777', 'ffffffff-ffff-ffff-ffff-ffffffffffff', '44444444-4444-4444-4444-444444444444', 'RLS is really powerful!'),
('66666666-6666-6666-6666-666666666666', 'eeeeeeee-eeee-eeee-eeee-eeeeeeeeeeee', '11111111-1111-1111-1111-111111111111', 'Welcome Charlie!')
ON CONFLICT (id) DO NOTHING;
""")
print("✓ Created 4 sample comments")
# Create indexes for better RLS performance
cur.execute("""
CREATE INDEX IF NOT EXISTS idx_channels_owner_id ON channels(owner_id);
CREATE INDEX IF NOT EXISTS idx_channels_is_public ON channels(is_public);
CREATE INDEX IF NOT EXISTS idx_channel_moderators_channel_user ON channel_moderators(channel_id, user_id);
CREATE INDEX IF NOT EXISTS idx_channel_moderators_user ON channel_moderators(user_id);
CREATE INDEX IF NOT EXISTS idx_posts_channel_id ON posts(channel_id);
CREATE INDEX IF NOT EXISTS idx_posts_author_id ON posts(author_id);
CREATE INDEX IF NOT EXISTS idx_posts_created_at ON posts(created_at);
CREATE INDEX IF NOT EXISTS idx_comments_post_id ON comments(post_id);
CREATE INDEX IF NOT EXISTS idx_comments_author_id ON comments(author_id);
CREATE INDEX IF NOT EXISTS idx_comments_created_at ON comments(created_at);
CREATE INDEX IF NOT EXISTS idx_users_is_public ON users(is_public);
""")
print("✓ Created performance indexes for RLS")
cur.close()
conn.close()
except Exception as e:
print(f"Error setting up environment: {e}")
sys.exit(1)
if __name__ == "__main__":
setup_rls_environment()
================================================
FILE: tasks/postgres/standard/security/rls_business_access/verify.py
================================================
#!/usr/bin/env python3
import os
import psycopg2
from psycopg2.extensions import ISOLATION_LEVEL_AUTOCOMMIT
import sys
def verify_rls_implementation():
"""
Verify that Row Level Security policies have been properly implemented
for the social media platform with Users, Posts, Comments, and Channels.
"""
# Database connection parameters from environment
admin_db_params = {
'host': os.getenv('POSTGRES_HOST', 'localhost'),
'port': os.getenv('POSTGRES_PORT', '5432'),
'user': os.getenv('POSTGRES_USERNAME', 'postgres'),
'password': os.getenv('POSTGRES_PASSWORD', 'password'),
'database': os.getenv('POSTGRES_DATABASE', 'postgres')
}
# Test user parameters (non-superuser for proper RLS testing)
test_db_params = {
'host': os.getenv('POSTGRES_HOST', 'localhost'),
'port': os.getenv('POSTGRES_PORT', '5432'),
'user': 'test_user',
'password': 'testpass',
'database': os.getenv('POSTGRES_DATABASE', 'postgres')
}
try:
# First connect as admin to ensure test user exists
admin_conn = psycopg2.connect(**admin_db_params)
admin_conn.set_isolation_level(ISOLATION_LEVEL_AUTOCOMMIT)
admin_cur = admin_conn.cursor()
# Create test user if it doesn't exist
try:
admin_cur.execute("CREATE ROLE test_user LOGIN PASSWORD 'testpass';")
except psycopg2.Error:
pass # User already exists
# Grant necessary permissions to test user on the current database
admin_cur.execute("SELECT current_database();")
current_db_name = admin_cur.fetchone()[0]
admin_cur.execute(f"GRANT CONNECT ON DATABASE \"{current_db_name}\" TO test_user;")
admin_cur.execute("GRANT USAGE ON SCHEMA public TO test_user;")
admin_cur.execute("GRANT SELECT, INSERT, UPDATE, DELETE ON ALL TABLES IN SCHEMA public TO test_user;")
admin_cur.execute("GRANT USAGE, SELECT ON ALL SEQUENCES IN SCHEMA public TO test_user;")
admin_cur.execute("GRANT EXECUTE ON ALL FUNCTIONS IN SCHEMA public TO test_user;")
admin_cur.close()
admin_conn.close()
# Update test_db_params with the correct database name
test_db_params['database'] = current_db_name
# Now connect as test user for RLS verification
conn = psycopg2.connect(**test_db_params)
conn.set_isolation_level(ISOLATION_LEVEL_AUTOCOMMIT)
cur = conn.cursor()
print("Verifying...")
test_results = []
# Test 1: Check if RLS is enabled on all tables
print("\n1. Checking RLS enablement...")
expected_tables = ['users', 'channels', 'channel_moderators', 'posts', 'comments']
for table in expected_tables:
cur.execute("""
SELECT relrowsecurity
FROM pg_class
WHERE relname = %s AND relkind = 'r'
""", (table,))
result = cur.fetchone()
if result and result[0]:
test_results.append(f"✓ RLS enabled on {table}")
else:
test_results.append(f"✗ RLS NOT enabled on {table}")
# Test 2: Users can only update their own profile
print("\n2. Testing user profile access control...")
# Alice tries to update her own profile (should work)
try:
cur.execute("SET app.current_user_id = '11111111-1111-1111-1111-111111111111';") # Alice
cur.execute("""
UPDATE users
SET email = 'alice.updated@example.com'
WHERE id = '11111111-1111-1111-1111-111111111111'
""")
test_results.append("✓ Users can update their own profile")
except Exception as e:
test_results.append(f"✗ User cannot update own profile: {e}")
# Alice tries to update Bob's profile (should fail)
try:
cur.execute("SET app.current_user_id = '11111111-1111-1111-1111-111111111111';") # Alice
cur.execute("""
UPDATE users
SET email = 'bob.hacked@example.com'
WHERE id = '22222222-2222-2222-2222-222222222222'
""")
# Check if the update actually affected any rows (RLS blocks by affecting 0 rows)
if cur.rowcount == 0:
test_results.append("✓ Users blocked from updating other users' profiles")
else:
test_results.append("✗ User was able to update another user's profile (should be blocked)")
except psycopg2.Error:
test_results.append("✓ Users blocked from updating other users' profiles")
# Test 3: Channel ownership controls
print("\n3. Testing channel ownership controls...")
# Alice (owner of general channel) tries to update her channel
try:
cur.execute("SET app.current_user_id = '11111111-1111-1111-1111-111111111111';") # Alice
cur.execute("""
UPDATE channels
SET description = 'Updated by Alice'
WHERE id = 'aaaaaaaa-aaaa-aaaa-aaaa-aaaaaaaaaaaa'
""")
test_results.append("✓ Channel owners can update their channels")
except Exception as e:
test_results.append(f"✗ Channel owner cannot update channel: {e}")
# Charlie tries to update Alice's channel (should fail)
try:
cur.execute("SET app.current_user_id = '33333333-3333-3333-3333-333333333333';") # Charlie
cur.execute("""
UPDATE channels
SET description = 'Hacked by Charlie'
WHERE id = 'aaaaaaaa-aaaa-aaaa-aaaa-aaaaaaaaaaaa'
""")
# Check if the update actually affected any rows (RLS blocks by affecting 0 rows)
if cur.rowcount == 0:
test_results.append("✓ Non-owners blocked from updating channels")
else:
test_results.append("✗ Non-owner was able to update channel (should be blocked)")
except psycopg2.Error:
test_results.append("✓ Non-owners blocked from updating channels")
# Test 4: Post authorship and moderation controls
print("\n4. Testing post access controls...")
# Alice (author) tries to update her own post
try:
cur.execute("SET app.current_user_id = '11111111-1111-1111-1111-111111111111';") # Alice
cur.execute("""
UPDATE posts
SET title = 'Updated by Alice'
WHERE id = 'dddddddd-dddd-dddd-dddd-dddddddddddd'
""")
test_results.append("✓ Post authors can update their posts")
except Exception as e:
test_results.append(f"✗ Post author cannot update post: {e}")
# Bob (moderator of general) tries to update Alice's post (should work)
try:
cur.execute("SET app.current_user_id = '22222222-2222-2222-2222-222222222222';") # Bob (moderator)
cur.execute("""
UPDATE posts
SET content = 'Moderated by Bob'
WHERE id = 'dddddddd-dddd-dddd-dddd-dddddddddddd'
""")
test_results.append("✓ Channel moderators can update posts in their channels")
except Exception as e:
test_results.append(f"✗ Channel moderator cannot update post: {e}")
# Eve tries to update Alice's post (should fail - not author, owner, or moderator)
try:
cur.execute("SET app.current_user_id = '55555555-5555-5555-5555-555555555555';") # Eve
cur.execute("""
UPDATE posts
SET content = 'Hacked by Eve'
WHERE id = 'dddddddd-dddd-dddd-dddd-dddddddddddd'
""")
# Check if the update actually affected any rows (RLS blocks by affecting 0 rows)
if cur.rowcount == 0:
test_results.append("✓ Unauthorized users blocked from updating posts")
else:
test_results.append("✗ Unauthorized user was able to update post (should be blocked)")
except psycopg2.Error:
test_results.append("✓ Unauthorized users blocked from updating posts")
# Test 5: Comment access controls
print("\n5. Testing comment access controls...")
# Bob (comment author) tries to update his own comment
try:
cur.execute("SET app.current_user_id = '22222222-2222-2222-2222-222222222222';") # Bob
cur.execute("""
UPDATE comments
SET content = 'Updated by Bob himself'
WHERE id = '99999999-9999-9999-9999-999999999999'
""")
test_results.append("✓ Comment authors can update their comments")
except Exception as e:
test_results.append(f"✗ Comment author cannot update comment: {e}")
# Alice (post author) tries to update Bob's comment on her post (should work)
try:
cur.execute("SET app.current_user_id = '11111111-1111-1111-1111-111111111111';") # Alice (post author)
cur.execute("""
UPDATE comments
SET content = 'Moderated by post author Alice'
WHERE id = '99999999-9999-9999-9999-999999999999'
""")
test_results.append("✓ Post authors can moderate comments on their posts")
except Exception as e:
test_results.append(f"✗ Post author cannot moderate comment: {e}")
# Test 6: Channel moderator assignment controls
print("\n6. Testing moderator assignment controls...")
# Alice (channel owner) tries to add a moderator
try:
cur.execute("SET app.current_user_id = '11111111-1111-1111-1111-111111111111';") # Alice (owner of general)
cur.execute("""
INSERT INTO channel_moderators (channel_id, user_id)
VALUES ('aaaaaaaa-aaaa-aaaa-aaaa-aaaaaaaaaaaa', '33333333-3333-3333-3333-333333333333')
""")
test_results.append("✓ Channel owners can add moderators")
except Exception as e:
test_results.append(f"✗ Channel owner cannot add moderator: {e}")
# Charlie tries to add himself as moderator to Bob's channel (should fail)
try:
cur.execute("SET app.current_user_id = '33333333-3333-3333-3333-333333333333';") # Charlie
cur.execute("""
INSERT INTO channel_moderators (channel_id, user_id)
VALUES ('bbbbbbbb-bbbb-bbbb-bbbb-bbbbbbbbbbbb', '33333333-3333-3333-3333-333333333333')
""")
# Check if the insert actually affected any rows (RLS blocks by affecting 0 rows)
if cur.rowcount == 0:
test_results.append("✓ Non-owners blocked from adding moderators")
else:
test_results.append("✗ Non-owner was able to add moderator (should be blocked)")
except psycopg2.Error:
test_results.append("✓ Non-owners blocked from adding moderators")
# Test 7: Content visibility based on user context
print("\n7. Testing content visibility...")
# Count posts visible to Alice
cur.execute("SET app.current_user_id = '11111111-1111-1111-1111-111111111111';") # Alice
cur.execute("SELECT COUNT(*) FROM posts;")
alice_posts = cur.fetchone()[0]
# Count posts visible to Eve
cur.execute("SET app.current_user_id = '55555555-5555-5555-5555-555555555555';") # Eve
cur.execute("SELECT COUNT(*) FROM posts;")
eve_posts = cur.fetchone()[0]
if alice_posts >= 2 and eve_posts >= 1: # Alice should see posts in channels she has access to
test_results.append("✓ Content visibility varies correctly based on user context")
else:
test_results.append(f"✗ Content visibility issue: Alice sees {alice_posts}, Eve sees {eve_posts}")
# Test 8: Anonymous user access
print("\n8. Testing anonymous user restrictions...")
try:
cur.execute("SET app.current_user_id = '';") # Anonymous user
cur.execute("SELECT COUNT(*) FROM users;")
anon_users = cur.fetchone()[0]
# Anonymous users should be able to see public user profiles per requirements
# Count public users that should be visible
cur.execute("SELECT COUNT(*) FROM users WHERE is_public = true;")
public_users = cur.fetchone()[0] if cur.rowcount > 0 else 0
if anon_users == public_users and anon_users > 0:
test_results.append(f"✓ Anonymous users can see {anon_users} public user profiles (correct)")
elif anon_users == 0:
test_results.append("✗ Anonymous users cannot see any users (should see public profiles)")
else:
test_results.append(f"✗ Anonymous users can see {anon_users} users but expected {public_users} public users")
except Exception as e:
test_results.append("✓ Anonymous users properly restricted")
# Print results
print("\n" + "="*60)
print("RLS VERIFICATION RESULTS - SOCIAL MEDIA PLATFORM")
print("="*60)
passed = sum(1 for result in test_results if result.startswith("✓"))
failed = sum(1 for result in test_results if result.startswith("✗"))
for result in test_results:
print(result)
print(f"\nSummary: {passed} passed, {failed} failed")
cur.close()
conn.close()
if failed == 0:
print("\nAll tests passed.")
return True
else:
print(f"\n{failed} test(s) failed.")
return False
except Exception as e:
print(f"Error during verification: {e}")
return False
if __name__ == "__main__":
success = verify_rls_implementation()
sys.exit(0 if success else 1)
================================================
FILE: tasks/postgres/standard/security/user_permission_audit/description.md
================================================
Conduct a comprehensive security audit to identify dangling PostgreSQL users and accounts with missing or excessive permissions in a business database environment.
## Your Mission:
You've been hired as a security consultant to audit the PostgreSQL database permissions for a growing e-commerce company. The company has experienced rapid growth and multiple teams have been granted database access over time. However, there's concern about permission inconsistencies and security gaps.
## Security Audit Requirements:
1. **Discover the database structure**: Identify all business tables and their purposes
2. **Catalog all database users and roles**: Use `pg_user`, `pg_roles`, and `pg_auth_members` to find all accounts
3. **Analyze current permissions**: Use `information_schema.table_privileges` to map permissions (sample discovery queries follow this list)
4. **Identify security issues**:
- **Dangling users**: Inactive accounts that should be removed
- **Missing permissions**: Users lacking permissions required for their business role
- **Excessive permissions**: Users with unnecessary permissions that should be revoked
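For illustration, the discovery for steps 2-3 might start from catalog queries like these (filters and aliases are only examples):
```sql
-- Illustrative: list non-system roles and whether they can log in.
SELECT rolname, rolcanlogin, rolsuper
FROM pg_roles
WHERE rolname NOT LIKE 'pg_%'
ORDER BY rolname;

-- Illustrative: map current table-level grants in the public schema.
SELECT grantee, table_name, privilege_type
FROM information_schema.table_privileges
WHERE table_schema = 'public'
ORDER BY grantee, table_name, privilege_type;
```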
## Expected permissions by role (what they SHOULD have)
```python
# each user's business role
USER_ROLE = {
# Active functional users
'analytics_user': 'Analytics Team',
'marketing_user': 'Marketing Department',
'customer_service': 'Customer Service',
'finance_user': 'Finance Team',
'product_manager': 'Product Management',
'security_auditor': 'Security Team',
'developer_user': 'Development Team',
'backup_user': 'Backup Service',
}
# expected permissions for each role
ROLE_EXPECTED_PERMISSIONS = {
'Analytics Team': [
('user_profiles', 'SELECT'),
('user_stat_analysis', 'SELECT'),
('product_catalog', 'SELECT'),
('order_management', 'SELECT'),
],
'Marketing Department': [
('user_profiles', 'SELECT'),
('user_stat_analysis', 'SELECT'),
('product_catalog', 'SELECT'),
],
'Customer Service': [
('user_profiles', 'SELECT'),
('user_profiles', 'UPDATE'),
('order_management', 'SELECT'),
('order_management', 'INSERT'),
('order_management', 'UPDATE'),
('product_catalog', 'SELECT'),
],
'Finance Team': [
('financial_transactions', 'SELECT'),
('order_management', 'SELECT'),
('user_profiles', 'SELECT'),
],
'Product Management': [
('product_catalog', 'SELECT'),
('product_catalog', 'INSERT'),
('product_catalog', 'UPDATE'),
('product_catalog', 'DELETE'),
('order_management', 'SELECT'),
('user_stat_analysis', 'SELECT'),
],
'Security Team': [
('audit_logs', 'SELECT'),
('user_credentials', 'SELECT'),
('user_profiles', 'SELECT'),
],
'Development Team': [
('user_profiles', 'SELECT'),
('product_catalog', 'SELECT'),
],
'Backup Service': [
('user_profiles', 'SELECT'),
('product_catalog', 'SELECT'),
('order_management', 'SELECT'),
('financial_transactions', 'SELECT'),
('user_stat_analysis', 'SELECT'),
('audit_logs', 'SELECT'),
('user_credentials', 'SELECT'),
]
}
```
## Expected Deliverables:
Your audit must produce findings in a structured format that can be verified. Create two tables to store your audit results:
**1. Summary Table:**
```sql
CREATE TABLE security_audit_results (
audit_id SERIAL PRIMARY KEY,
audit_type VARCHAR(50) NOT NULL, -- 'DANGLING_USERS', 'MISSING_PERMISSIONS', 'EXCESSIVE_PERMISSIONS'
total_issues INTEGER NOT NULL,
users_affected INTEGER NOT NULL,
tables_affected INTEGER NOT NULL
);
```
**2. Detailed Findings Table:**
```sql
CREATE TABLE security_audit_details (
detail_id SERIAL PRIMARY KEY,
username VARCHAR(50) NOT NULL,
issue_type VARCHAR(50) NOT NULL, -- 'DANGLING_USER', 'MISSING_PERMISSION', 'EXCESSIVE_PERMISSION'
table_name VARCHAR(50), -- NULL for dangling users
permission_type VARCHAR(20), -- 'SELECT', 'INSERT', 'UPDATE', 'DELETE', NULL for dangling users
expected_access BOOLEAN NOT NULL -- TRUE if user should have access, FALSE if should not
);
```
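For example, individual findings would be recorded as rows like the following (the values are hypothetical and only show how the columns are meant to be used; `table_name` and `permission_type` stay `NULL` for dangling users):
```sql
-- Hypothetical example rows: one missing permission and one dangling user.
INSERT INTO security_audit_details (username, issue_type, table_name, permission_type, expected_access)
VALUES
    ('analytics_user', 'MISSING_PERMISSION', 'user_profiles', 'SELECT', TRUE),
    ('temp_contractor', 'DANGLING_USER', NULL, NULL, FALSE);
```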
## Success Criteria:
Your audit should populate both tables with:
- **Summary data**: High-level counts of different types of security issues
- **Detailed findings**: Specific permission gaps for each user and table combination
## Business Role Expectations
Analyze usernames and infer their intended business roles based on naming patterns (a sketch of this mapping follows the list):
- **analytics_user** → Analytics Team (needs user behavior and statistics data)
- **marketing_user** → Marketing Department (needs customer and product data for campaigns)
- **customer_service** → Customer Service (needs user profiles and order management)
- **finance_user** → Finance Team (needs financial and order data)
- **product_manager** → Product Management (needs full product catalog access)
- **security_auditor** → Security Team (needs audit logs and credential data)
- **developer_user** → Development Team (needs limited access for testing)
- **backup_user** → Backup Service (needs read-only access to all business data)
- **temp_contractor, old_employee, test_account** → Inactive/Temporary (should have NO permissions)
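A minimal sketch of that inference, assuming simple `LIKE` patterns on the username are sufficient (the patterns and exclusions are illustrative):
```sql
-- Illustrative: classify accounts by name pattern; unmatched names fall through.
SELECT usename,
       CASE
           WHEN usename LIKE '%analytics%' THEN 'Analytics Team'
           WHEN usename LIKE '%marketing%' THEN 'Marketing Department'
           WHEN usename LIKE '%customer%' OR usename LIKE '%service%' THEN 'Customer Service'
           WHEN usename LIKE '%finance%' THEN 'Finance Team'
           WHEN usename LIKE '%product%' THEN 'Product Management'
           WHEN usename LIKE '%security%' OR usename LIKE '%audit%' THEN 'Security Team'
           WHEN usename LIKE '%developer%' THEN 'Development Team'
           WHEN usename LIKE '%backup%' THEN 'Backup Service'
           WHEN usename LIKE '%temp%' OR usename LIKE '%old%' OR usename LIKE '%test%' THEN 'Inactive/Temporary'
           ELSE 'Unknown'
       END AS inferred_business_role
FROM pg_user
WHERE usename <> 'postgres'   -- exclude administrative/service accounts as appropriate
ORDER BY usename;
```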
The verification process will check that your findings correctly identify the actual permission gaps in the system by comparing against expected results.
================================================
FILE: tasks/postgres/standard/security/user_permission_audit/ground_truth.sql
================================================
-- Ground Truth Solution: Complete Security Audit Implementation
-- This includes comprehensive PostgreSQL user, role, and permission discovery
/*
================================================================================
PERMISSION MODEL DOCUMENTATION
================================================================================
## Current Permission State
| Username | Table | Permission | Status | Reason |
|-------------------|------------------------|------------|---------|-------------------------------------------|
| analytics_user | user_stat_analysis | SELECT | EXISTS | Correctly granted |
| analytics_user | user_profiles | SELECT | MISSING | Permission was revoked |
| analytics_user | financial_transactions | SELECT | EXISTS | Should be revoked - no business need |
| marketing_user | user_profiles | SELECT | EXISTS | Correctly granted |
| marketing_user | user_stat_analysis | SELECT | EXISTS | Correctly granted |
| marketing_user | product_catalog | SELECT | MISSING | Permission was revoked |
| marketing_user | financial_transactions | SELECT | EXISTS | Should be revoked - security risk |
| customer_service | user_profiles | SELECT | EXISTS | Correctly granted |
| customer_service | user_profiles | UPDATE | EXISTS | Correctly granted |
| customer_service | order_management | SELECT | EXISTS | Correctly granted |
| customer_service | order_management | INSERT | EXISTS | Correctly granted |
| customer_service | order_management | UPDATE | EXISTS | Correctly granted |
| customer_service | product_catalog | SELECT | MISSING | Permission was revoked |
| customer_service | user_credentials | SELECT | EXISTS | Should be revoked - security risk |
| finance_user | financial_transactions | SELECT | EXISTS | Correctly granted |
| finance_user | order_management | SELECT | EXISTS | Correctly granted |
| finance_user | user_profiles | SELECT | MISSING | Permission was revoked |
| product_manager | product_catalog | SELECT | EXISTS | Correctly granted |
| product_manager | product_catalog | INSERT | EXISTS | Correctly granted |
| product_manager | product_catalog | UPDATE | EXISTS | Correctly granted |
| product_manager | product_catalog | DELETE | EXISTS | Correctly granted |
| product_manager | order_management | SELECT | EXISTS | Correctly granted |
| product_manager | financial_transactions | SELECT | EXISTS | Should be revoked - no business need |
| security_auditor | user_credentials | SELECT | EXISTS | Correctly granted |
| security_auditor | user_profiles | SELECT | EXISTS | Correctly granted |
| security_auditor | audit_logs | SELECT | MISSING | Permission was revoked |
| security_auditor | financial_transactions | UPDATE | EXISTS | Should be revoked - excessive privilege |
| developer_user | user_profiles | SELECT | EXISTS | Correctly granted |
| developer_user | product_catalog | SELECT | MISSING | Permission was revoked |
| developer_user | user_credentials | SELECT | EXISTS | Should be revoked - security risk |
| developer_user | order_management | UPDATE | EXISTS | Should be revoked - no business need |
| backup_user | user_profiles | SELECT | EXISTS | Correctly granted |
| backup_user | product_catalog | SELECT | EXISTS | Correctly granted |
| backup_user | audit_logs | SELECT | EXISTS | Correctly granted |
| backup_user | order_management | SELECT | MISSING | Permission was revoked |
| backup_user | product_catalog | DELETE | EXISTS | Should be revoked - backup should be read-only |
| temp_contractor | product_catalog | SELECT | EXISTS | Should be revoked - user is inactive |
| temp_contractor | user_profiles | SELECT | EXISTS | Should be revoked - user is inactive |
| old_employee | audit_logs | SELECT | EXISTS | Should be revoked - user is inactive |
| old_employee | user_stat_analysis | UPDATE | EXISTS | Should be revoked - user is inactive |
| test_account | user_profiles | SELECT | EXISTS | Should be revoked - test account |
## Expected Permission State
| Username | Table | Permission | Justification |
|-------------------|------------------------|------------|--------------------------------------------------------------|
| analytics_user | user_profiles | SELECT | Analytics team needs customer data for user behavior analysis|
| analytics_user | user_stat_analysis | SELECT | Core analytics data required for reporting |
| analytics_user | product_catalog | SELECT | Product performance analysis and customer preferences |
| analytics_user | order_management | SELECT | Sales trend analysis and customer purchasing patterns |
| marketing_user | user_profiles | SELECT | Customer segmentation and personalized marketing campaigns |
| marketing_user | user_stat_analysis | SELECT | Campaign effectiveness analysis and user behavior tracking |
| marketing_user | product_catalog | SELECT | Product promotion planning and marketing material creation |
| customer_service | user_profiles | SELECT | Customer identity verification and support |
| customer_service | user_profiles | UPDATE | Update customer information and resolve account issues |
| customer_service | order_management | SELECT | Order status inquiries and customer support |
| customer_service | order_management | INSERT | Create orders for customers over phone |
| customer_service | order_management | UPDATE | Update order status and resolve order issues |
| customer_service | product_catalog | SELECT | Product information for customer questions and support |
| finance_user | financial_transactions | SELECT | Financial reporting, auditing, and compliance |
| finance_user | order_management | SELECT | Revenue reconciliation and financial analysis |
| finance_user | user_profiles | SELECT | Customer financial analysis and credit assessment |
| product_manager | product_catalog | SELECT | Product information access and management |
| product_manager | product_catalog | INSERT | Add new products to catalog |
| product_manager | product_catalog | UPDATE | Update product details, pricing, and specifications |
| product_manager | product_catalog | DELETE | Remove discontinued or obsolete products |
| product_manager | order_management | SELECT | Product sales analysis and demand forecasting |
| product_manager | user_stat_analysis | SELECT | Product usage analytics and customer behavior insights |
| security_auditor | audit_logs | SELECT | Security monitoring and incident investigation |
| security_auditor | user_credentials | SELECT | Security auditing and compliance verification |
| security_auditor | user_profiles | SELECT | User account auditing and security incident investigation |
| developer_user | user_profiles | SELECT | Application development and testing with realistic data |
| developer_user | product_catalog | SELECT | Application development and testing with product data |
| backup_user | user_profiles | SELECT | Complete data backup coverage for business continuity |
| backup_user | product_catalog | SELECT | Complete data backup coverage for business continuity |
| backup_user | order_management | SELECT | Complete data backup coverage for business continuity |
| backup_user | financial_transactions | SELECT | Complete data backup coverage for business continuity |
| backup_user | user_stat_analysis | SELECT | Complete data backup coverage for business continuity |
| backup_user | audit_logs | SELECT | Complete data backup coverage for business continuity |
| backup_user | user_credentials | SELECT | Complete data backup coverage for business continuity |
Notes:
- temp_contractor, old_employee, test_account should have NO permissions (accounts should be removed)
- All excessive permissions should be revoked for security compliance
- Missing permissions should be granted based on business role requirements
================================================================================
*/
BEGIN;
-- ============================================================================
-- CREATE AUDIT RESULTS TABLES
-- ============================================================================
CREATE TABLE security_audit_results (
audit_id SERIAL PRIMARY KEY,
audit_type VARCHAR(50) NOT NULL, -- 'DANGLING_USERS', 'MISSING_PERMISSIONS', 'EXCESSIVE_PERMISSIONS'
total_issues INTEGER NOT NULL,
users_affected INTEGER NOT NULL,
tables_affected INTEGER NOT NULL
);
CREATE TABLE security_audit_details (
detail_id SERIAL PRIMARY KEY,
username VARCHAR(50) NOT NULL,
issue_type VARCHAR(50) NOT NULL, -- 'DANGLING_USER', 'MISSING_PERMISSION', 'EXCESSIVE_PERMISSION'
table_name VARCHAR(50), -- NULL for dangling users
permission_type VARCHAR(20), -- 'SELECT', 'INSERT', 'UPDATE', 'DELETE', NULL for dangling users
expected_access BOOLEAN NOT NULL -- TRUE if user should have access, FALSE if should not
);
-- ============================================================================
-- DISCOVER DATABASE USERS AND ROLES
-- ============================================================================
CREATE TEMP TABLE temp_user_discovery AS
SELECT DISTINCT
COALESCE(u.usename, r.rolname) as username,
COALESCE(u.usesuper, r.rolsuper) as is_superuser,
COALESCE(u.usecreatedb, r.rolcreatedb) as can_create_db,
r.rolname as role_name,
u.usename as user_name,
CASE
WHEN COALESCE(u.usename, r.rolname) LIKE '%analytics%' THEN 'Analytics Team'
WHEN COALESCE(u.usename, r.rolname) LIKE '%marketing%' THEN 'Marketing Department'
WHEN COALESCE(u.usename, r.rolname) LIKE '%customer%' OR COALESCE(u.usename, r.rolname) LIKE '%service%' THEN 'Customer Service'
WHEN COALESCE(u.usename, r.rolname) LIKE '%finance%' THEN 'Finance Team'
WHEN COALESCE(u.usename, r.rolname) LIKE '%product%' THEN 'Product Management'
WHEN COALESCE(u.usename, r.rolname) LIKE '%security%' OR COALESCE(u.usename, r.rolname) LIKE '%audit%' THEN 'Security Team'
WHEN COALESCE(u.usename, r.rolname) LIKE '%backup%' THEN 'Backup Service'
WHEN COALESCE(u.usename, r.rolname) LIKE '%developer%' OR COALESCE(u.usename, r.rolname) LIKE '%dev%' THEN 'Development Team'
WHEN COALESCE(u.usename, r.rolname) LIKE '%temp%' OR COALESCE(u.usename, r.rolname) LIKE '%old%' OR COALESCE(u.usename, r.rolname) LIKE '%test%' THEN 'Inactive/Temporary'
ELSE 'Unknown'
END as inferred_business_role
FROM pg_user u
FULL OUTER JOIN pg_roles r ON u.usename = r.rolname
WHERE COALESCE(u.usename, r.rolname) NOT IN ('postgres', 'test_user')
AND COALESCE(u.usename, r.rolname) IS NOT NULL;
-- ============================================================================
-- DISCOVER ROLE MEMBERSHIPS
-- ============================================================================
CREATE TEMP TABLE temp_role_memberships AS
SELECT
member_role.rolname as member_name,
granted_role.rolname as granted_role_name,
grantor_role.rolname as grantor_name,
am.admin_option
FROM pg_auth_members am
JOIN pg_roles member_role ON am.member = member_role.oid
JOIN pg_roles granted_role ON am.roleid = granted_role.oid
JOIN pg_roles grantor_role ON am.grantor = grantor_role.oid
WHERE member_role.rolname NOT IN ('postgres')
AND granted_role.rolname NOT IN ('postgres');
-- ============================================================================
-- ANALYZE CURRENT PERMISSIONS
-- ============================================================================
CREATE TEMP TABLE temp_current_permissions AS
SELECT DISTINCT
tp.grantee as username,
tp.table_name,
tp.privilege_type as permission_type,
tp.is_grantable,
tp.grantor,
ud.inferred_business_role,
ud.is_superuser
FROM information_schema.table_privileges tp
LEFT JOIN temp_user_discovery ud ON tp.grantee = ud.username
WHERE tp.table_schema = 'public'
AND tp.grantee NOT IN ('postgres', 'PUBLIC', 'test_user')
AND tp.table_name NOT LIKE 'security_audit_%'
ORDER BY tp.grantee, tp.table_name, tp.privilege_type;
-- ============================================================================
-- IDENTIFY DANGLING USERS
-- ============================================================================
INSERT INTO security_audit_details (username, issue_type, table_name, permission_type, expected_access)
SELECT DISTINCT
username,
'DANGLING_USER',
NULL,
NULL,
FALSE
FROM temp_user_discovery
WHERE inferred_business_role = 'Inactive/Temporary';
-- ============================================================================
-- IDENTIFY EXCESSIVE PERMISSIONS
-- ============================================================================
WITH excessive_permissions AS (
SELECT username, table_name, permission_type FROM (VALUES
('analytics_user', 'financial_transactions', 'SELECT'),
('marketing_user', 'financial_transactions', 'SELECT'),
('customer_service', 'user_credentials', 'SELECT'),
('product_manager', 'financial_transactions', 'SELECT'),
('security_auditor', 'financial_transactions', 'UPDATE'),
('developer_user', 'user_credentials', 'SELECT'),
('developer_user', 'order_management', 'UPDATE'),
('backup_user', 'product_catalog', 'DELETE'),
('temp_contractor', 'product_catalog', 'SELECT'),
('temp_contractor', 'user_profiles', 'SELECT'),
('old_employee', 'audit_logs', 'SELECT'),
('old_employee', 'user_stat_analysis', 'UPDATE'),
('test_account', 'user_profiles', 'SELECT')
) AS excessive(username, table_name, permission_type)
)
INSERT INTO security_audit_details (username, issue_type, table_name, permission_type, expected_access)
SELECT
ep.username,
'EXCESSIVE_PERMISSION',
ep.table_name,
ep.permission_type,
FALSE
FROM excessive_permissions ep
WHERE EXISTS (
SELECT 1 FROM temp_current_permissions cp
WHERE cp.username = ep.username
AND cp.table_name = ep.table_name
AND cp.permission_type = ep.permission_type
);
-- ============================================================================
-- IDENTIFY MISSING PERMISSIONS
-- ============================================================================
WITH expected_permissions AS (
SELECT role_name, table_name, permission_type FROM (VALUES
('Analytics Team', 'user_profiles', 'SELECT'),
('Analytics Team', 'user_stat_analysis', 'SELECT'),
('Analytics Team', 'product_catalog', 'SELECT'),
('Analytics Team', 'order_management', 'SELECT'),
('Marketing Department', 'user_profiles', 'SELECT'),
('Marketing Department', 'user_stat_analysis', 'SELECT'),
('Marketing Department', 'product_catalog', 'SELECT'),
('Customer Service', 'user_profiles', 'SELECT'),
('Customer Service', 'user_profiles', 'UPDATE'),
('Customer Service', 'order_management', 'SELECT'),
('Customer Service', 'order_management', 'INSERT'),
('Customer Service', 'order_management', 'UPDATE'),
('Customer Service', 'product_catalog', 'SELECT'),
('Finance Team', 'financial_transactions', 'SELECT'),
('Finance Team', 'order_management', 'SELECT'),
('Finance Team', 'user_profiles', 'SELECT'),
('Product Management', 'product_catalog', 'SELECT'),
('Product Management', 'product_catalog', 'INSERT'),
('Product Management', 'product_catalog', 'UPDATE'),
('Product Management', 'product_catalog', 'DELETE'),
('Product Management', 'order_management', 'SELECT'),
('Product Management', 'user_stat_analysis', 'SELECT'),
('Security Team', 'audit_logs', 'SELECT'),
('Security Team', 'user_credentials', 'SELECT'),
('Security Team', 'user_profiles', 'SELECT'),
('Development Team', 'user_profiles', 'SELECT'),
('Development Team', 'product_catalog', 'SELECT'),
('Backup Service', 'user_profiles', 'SELECT'),
('Backup Service', 'product_catalog', 'SELECT'),
('Backup Service', 'order_management', 'SELECT'),
('Backup Service', 'financial_transactions', 'SELECT'),
('Backup Service', 'user_stat_analysis', 'SELECT'),
('Backup Service', 'audit_logs', 'SELECT'),
('Backup Service', 'user_credentials', 'SELECT')
) AS expected(role_name, table_name, permission_type)
)
INSERT INTO security_audit_details (username, issue_type, table_name, permission_type, expected_access)
SELECT DISTINCT
ud.username,
'MISSING_PERMISSION',
ep.table_name,
ep.permission_type,
TRUE
FROM temp_user_discovery ud
JOIN expected_permissions ep ON ud.inferred_business_role = ep.role_name
LEFT JOIN temp_current_permissions cp ON (
cp.username = ud.username
AND cp.table_name = ep.table_name
AND cp.permission_type = ep.permission_type
)
WHERE cp.username IS NULL
AND ud.inferred_business_role != 'Inactive/Temporary'
AND ud.inferred_business_role != 'Unknown'
AND EXISTS (
SELECT 1 FROM information_schema.tables t
WHERE t.table_name = ep.table_name
AND t.table_schema = 'public'
AND t.table_type = 'BASE TABLE'
);
-- ============================================================================
-- POPULATE SUMMARY STATISTICS
-- ============================================================================
INSERT INTO security_audit_results (audit_type, total_issues, users_affected, tables_affected)
SELECT
'DANGLING_USERS',
COUNT(*),
COUNT(DISTINCT username),
0
FROM security_audit_details
WHERE issue_type = 'DANGLING_USER';
INSERT INTO security_audit_results (audit_type, total_issues, users_affected, tables_affected)
SELECT
'MISSING_PERMISSIONS',
COUNT(*),
COUNT(DISTINCT username),
COUNT(DISTINCT table_name)
FROM security_audit_details
WHERE issue_type = 'MISSING_PERMISSION';
INSERT INTO security_audit_results (audit_type, total_issues, users_affected, tables_affected)
SELECT
'EXCESSIVE_PERMISSIONS',
COUNT(*),
COUNT(DISTINCT username),
COUNT(DISTINCT table_name)
FROM security_audit_details
WHERE issue_type = 'EXCESSIVE_PERMISSION';
-- ============================================================================
-- CLEANUP TEMPORARY TABLES
-- ============================================================================
DROP TABLE temp_user_discovery;
DROP TABLE temp_role_memberships;
DROP TABLE temp_current_permissions;
COMMIT;
-- ============================================================================
-- DISCOVERY AND VERIFICATION QUERIES
-- ============================================================================
-- Show all users and their properties
SELECT
usename as username,
usesuper as is_superuser,
usecreatedb as can_create_db,
valuntil as password_expiry
FROM pg_user
WHERE usename NOT IN ('postgres', 'test_user')
ORDER BY usename;
-- Show all roles and their properties
SELECT
rolname as role_name,
rolsuper as is_superuser,
rolinherit as inherits_privileges,
rolcanlogin as can_login
FROM pg_roles
WHERE rolname NOT LIKE 'pg_%'
AND rolname NOT IN ('postgres', 'test_user')
ORDER BY rolname;
-- Show current table privileges
SELECT
grantee as username,
table_name,
privilege_type as permission,
is_grantable
FROM information_schema.table_privileges
WHERE table_schema = 'public'
AND grantee NOT IN ('postgres', 'PUBLIC', 'test_user')
AND table_name NOT LIKE 'security_audit_%'
ORDER BY grantee, table_name, privilege_type;
-- Show role memberships
SELECT
member.rolname as member,
granted.rolname as granted_role
FROM pg_auth_members am
JOIN pg_roles member ON am.member = member.oid
JOIN pg_roles granted ON am.roleid = granted.oid
WHERE member.rolname NOT IN ('postgres')
ORDER BY member.rolname, granted.rolname;
-- Display audit summary
SELECT
audit_type,
total_issues,
users_affected,
tables_affected
FROM security_audit_results
ORDER BY audit_type;
-- Display detailed findings
SELECT
username,
issue_type,
COALESCE(table_name, 'N/A') as table_name,
COALESCE(permission_type, 'N/A') as permission_type,
expected_access
FROM security_audit_details
ORDER BY issue_type, username, table_name;
================================================
FILE: tasks/postgres/standard/security/user_permission_audit/meta.json
================================================
{
"task_id": "user_permission_audit",
"task_name": "User Permission Audit",
"category_id": "security",
"category_name": "Security",
"description": "Conduct comprehensive security audit identifying users with insufficient or dangling permissions in business database environment.",
"author": "Fanshi Zhang",
"created_at": "2025-08-17",
"difficulty": "L3",
"tags": [
"security and access control",
"audit and compliance"
],
"mcp": [
"postgres"
],
"meta_data": {
"stateType": "text",
"stateContent": "Table \"user_profiles\" {\n \"user_id\" int4 [pk, not null, increment]\n \"username\" varchar(50) [unique, not null]\n \"email\" varchar(100) [unique, not null]\n \"first_name\" varchar(50) [not null]\n \"last_name\" varchar(50) [not null]\n \"phone\" varchar(20)\n \"address\" text\n \"city\" varchar(50)\n \"state\" varchar(2)\n \"zip_code\" varchar(10)\n \"date_created\" timestamp [default: `CURRENT_TIMESTAMP`]\n \"last_updated\" timestamp [default: `CURRENT_TIMESTAMP`]\n \"is_active\" bool [default: true]\n \"profile_picture_url\" text\n \"bio\" text\n}\n\nTable \"user_credentials\" {\n \"credential_id\" int4 [pk, not null, increment]\n \"user_id\" int4\n \"password_hash\" varchar(255) [not null]\n \"salt\" varchar(100) [not null]\n \"login_attempts\" int4 [default: 0]\n \"last_login\" timestamp\n \"password_created\" timestamp [default: `CURRENT_TIMESTAMP`]\n \"password_expires\" timestamp\n \"is_locked\" bool [default: false]\n \"two_factor_enabled\" bool [default: false]\n \"two_factor_secret\" varchar(32)\n \"backup_codes\" \"text[]\"\n \"security_questions\" jsonb\n}\n\nTable \"user_stat_analysis\" {\n \"analysis_id\" int4 [pk, not null, increment]\n \"user_id\" int4\n \"session_id\" varchar(100)\n \"page_views\" int4 [default: 0]\n \"time_spent_minutes\" int4 [default: 0]\n \"actions_performed\" jsonb\n \"device_info\" jsonb\n \"ip_address\" inet\n \"location_data\" jsonb\n \"referrer_url\" text\n \"conversion_events\" jsonb\n \"analysis_date\" date [default: `CURRENT_DATE`]\n \"created_at\" timestamp [default: `CURRENT_TIMESTAMP`]\n}\n\nTable \"product_catalog\" {\n \"product_id\" int4 [pk, not null, increment]\n \"product_name\" varchar(100) [not null]\n \"description\" text\n \"category\" varchar(50)\n \"price\" numeric(10,2) [not null]\n \"cost\" numeric(10,2)\n \"sku\" varchar(50) [unique]\n \"inventory_count\" int4 [default: 0]\n \"is_active\" bool [default: true]\n \"created_at\" timestamp [default: `CURRENT_TIMESTAMP`]\n \"updated_at\" timestamp [default: `CURRENT_TIMESTAMP`]\n \"supplier_info\" jsonb\n \"weight_kg\" numeric(6,2)\n \"dimensions\" jsonb\n}\n\nTable \"order_management\" {\n \"order_id\" int4 [pk, not null, increment]\n \"user_id\" int4\n \"order_number\" varchar(50) [unique, not null]\n \"order_status\" varchar(20) [default: 'pending']\n \"total_amount\" numeric(12,2) [not null]\n \"tax_amount\" numeric(12,2)\n \"shipping_amount\" numeric(12,2)\n \"discount_amount\" numeric(12,2) [default: 0]\n \"payment_method\" varchar(50)\n \"payment_status\" varchar(20) [default: 'pending']\n \"shipping_address\" jsonb\n \"billing_address\" jsonb\n \"order_date\" timestamp [default: `CURRENT_TIMESTAMP`]\n \"shipped_date\" timestamp\n \"delivered_date\" timestamp\n \"tracking_number\" varchar(100)\n}\n\nTable \"financial_transactions\" {\n \"transaction_id\" int4 [pk, not null, increment]\n \"order_id\" int4\n \"user_id\" int4\n \"transaction_type\" varchar(20) [not null]\n \"amount\" numeric(12,2) [not null]\n \"currency\" varchar(3) [default: 'USD']\n \"payment_gateway\" varchar(50)\n \"gateway_transaction_id\" varchar(100)\n \"credit_card_last_four\" bpchar(4)\n \"bank_account_last_four\" bpchar(4)\n \"transaction_status\" varchar(20) [default: 'pending']\n \"processed_at\" timestamp\n \"created_at\" timestamp [default: `CURRENT_TIMESTAMP`]\n \"fee_amount\" numeric(8,2)\n \"refund_amount\" numeric(12,2) [default: 0]\n \"notes\" text\n}\n\nTable \"audit_logs\" {\n \"log_id\" int4 [pk, not null, increment]\n \"user_id\" int4\n \"action_type\" varchar(50) 
[not null]\n \"table_name\" varchar(50)\n \"record_id\" int4\n \"old_values\" jsonb\n \"new_values\" jsonb\n \"ip_address\" inet\n \"user_agent\" text\n \"session_id\" varchar(100)\n \"timestamp\" timestamp [default: `CURRENT_TIMESTAMP`]\n \"success\" bool [default: true]\n \"error_message\" text\n}\n\nRef \"audit_logs_user_id_fkey\":\"user_profiles\".\"user_id\" < \"audit_logs\".\"user_id\"\n\nRef \"financial_transactions_order_id_fkey\":\"order_management\".\"order_id\" < \"financial_transactions\".\"order_id\"\n\nRef \"financial_transactions_user_id_fkey\":\"user_profiles\".\"user_id\" < \"financial_transactions\".\"user_id\"\n\nRef \"order_management_user_id_fkey\":\"user_profiles\".\"user_id\" < \"order_management\".\"user_id\"\n\nRef \"user_credentials_user_id_fkey\":\"user_profiles\".\"user_id\" < \"user_credentials\".\"user_id\" [delete: cascade]\n\nRef \"user_stat_analysis_user_id_fkey\":\"user_profiles\".\"user_id\" < \"user_stat_analysis\".\"user_id\" [delete: cascade]\n",
"stateUrl": null,
"stateOriginalUrl": null
}
}
================================================
FILE: tasks/postgres/standard/security/user_permission_audit/prepare_environment.py
================================================
#!/usr/bin/env python3
import os
import psycopg2
from psycopg2.extensions import ISOLATION_LEVEL_AUTOCOMMIT
import sys
# Configuration for users and their permissions
USER_CONFIGS = {
# Active functional users
'analytics_user': {
'password': 'analytics123',
'role': 'Analytics Team',
'status': 'active'
},
'marketing_user': {
'password': 'marketing123',
'role': 'Marketing Department',
'status': 'active'
},
'customer_service': {
'password': 'service123',
'role': 'Customer Service',
'status': 'active'
},
'finance_user': {
'password': 'finance123',
'role': 'Finance Team',
'status': 'active'
},
'product_manager': {
'password': 'product123',
'role': 'Product Management',
'status': 'active'
},
'security_auditor': {
'password': 'security123',
'role': 'Security Team',
'status': 'active'
},
'developer_user': {
'password': 'dev123',
'role': 'Development Team',
'status': 'active'
},
'backup_user': {
'password': 'backup123',
'role': 'Backup Service',
'status': 'active'
},
# Inactive/dangling users
'temp_contractor': {
'password': 'temp123',
'role': 'Inactive/Temporary',
'status': 'inactive'
},
'old_employee': {
'password': 'old456',
'role': 'Inactive/Temporary',
'status': 'inactive'
},
'test_account': {
'password': 'test789',
'role': 'Inactive/Temporary',
'status': 'inactive'
}
}
# Expected permissions by role (what they SHOULD have)
ROLE_EXPECTED_PERMISSIONS = {
'Analytics Team': [
('user_profiles', 'SELECT'),
('user_stat_analysis', 'SELECT'),
('product_catalog', 'SELECT'),
('order_management', 'SELECT'),
],
'Marketing Department': [
('user_profiles', 'SELECT'),
('user_stat_analysis', 'SELECT'),
('product_catalog', 'SELECT'),
],
'Customer Service': [
('user_profiles', 'SELECT'),
('user_profiles', 'UPDATE'),
('order_management', 'SELECT'),
('order_management', 'INSERT'),
('order_management', 'UPDATE'),
('product_catalog', 'SELECT'),
],
'Finance Team': [
('financial_transactions', 'SELECT'),
('order_management', 'SELECT'),
('user_profiles', 'SELECT'),
],
'Product Management': [
('product_catalog', 'SELECT'),
('product_catalog', 'INSERT'),
('product_catalog', 'UPDATE'),
('product_catalog', 'DELETE'),
('order_management', 'SELECT'),
('user_stat_analysis', 'SELECT'),
],
'Security Team': [
('audit_logs', 'SELECT'),
('user_credentials', 'SELECT'),
('user_profiles', 'SELECT'),
],
'Development Team': [
('user_profiles', 'SELECT'),
('product_catalog', 'SELECT'),
],
'Backup Service': [
('user_profiles', 'SELECT'),
('product_catalog', 'SELECT'),
('order_management', 'SELECT'),
('financial_transactions', 'SELECT'),
('user_stat_analysis', 'SELECT'),
('audit_logs', 'SELECT'),
('user_credentials', 'SELECT'),
],
}
# Excessive permissions that will be granted but should be flagged as security issues
EXCESSIVE_PERMISSIONS = [
# Users getting financial access they shouldn't have
('analytics_user', 'financial_transactions', 'SELECT'),
('marketing_user', 'financial_transactions', 'SELECT'),
('product_manager', 'financial_transactions', 'SELECT'),
# Security risks - credential access
('customer_service', 'user_credentials', 'SELECT'),
('developer_user', 'user_credentials', 'SELECT'),
# Excessive privileges
('security_auditor', 'financial_transactions', 'UPDATE'),
('developer_user', 'order_management', 'UPDATE'),
('backup_user', 'product_catalog', 'DELETE'), # Backup should be read-only
# Inactive users with permissions they shouldn't have
('temp_contractor', 'product_catalog', 'SELECT'),
('temp_contractor', 'user_profiles', 'SELECT'),
('old_employee', 'audit_logs', 'SELECT'),
('old_employee', 'user_stat_analysis', 'UPDATE'),
('test_account', 'user_profiles', 'SELECT'),
]
# Permissions to revoke to create "missing permission" findings
PERMISSIONS_TO_REVOKE = [
('analytics_user', 'user_profiles', 'SELECT'),
('analytics_user', 'order_management', 'SELECT'),
('analytics_user', 'product_catalog', 'SELECT'),
('marketing_user', 'product_catalog', 'SELECT'),
('finance_user', 'user_profiles', 'SELECT'),
('developer_user', 'product_catalog', 'SELECT'),
('customer_service', 'product_catalog', 'SELECT'),
('security_auditor', 'audit_logs', 'SELECT'),
('product_manager', 'user_stat_analysis', 'SELECT'),
('backup_user', 'order_management', 'SELECT'),
('backup_user', 'financial_transactions', 'SELECT'),
('backup_user', 'user_stat_analysis', 'SELECT'),
('backup_user', 'user_credentials', 'SELECT'),
]
def create_business_tables(cur):
"""Create all business tables"""
tables = [
('user_profiles', """
DROP TABLE IF EXISTS user_profiles CASCADE;
CREATE TABLE user_profiles (
user_id SERIAL PRIMARY KEY,
username VARCHAR(50) UNIQUE NOT NULL,
email VARCHAR(100) UNIQUE NOT NULL,
first_name VARCHAR(50) NOT NULL,
last_name VARCHAR(50) NOT NULL,
phone VARCHAR(20),
address TEXT,
city VARCHAR(50),
state VARCHAR(2),
zip_code VARCHAR(10),
date_created TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
last_updated TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
is_active BOOLEAN DEFAULT true,
profile_picture_url TEXT,
bio TEXT
);
"""),
('user_credentials', """
DROP TABLE IF EXISTS user_credentials CASCADE;
CREATE TABLE user_credentials (
credential_id SERIAL PRIMARY KEY,
user_id INTEGER REFERENCES user_profiles(user_id) ON DELETE CASCADE,
password_hash VARCHAR(255) NOT NULL,
salt VARCHAR(100) NOT NULL,
login_attempts INTEGER DEFAULT 0,
last_login TIMESTAMP,
password_created TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
password_expires TIMESTAMP,
is_locked BOOLEAN DEFAULT false,
two_factor_enabled BOOLEAN DEFAULT false,
two_factor_secret VARCHAR(32),
backup_codes TEXT[],
security_questions JSONB
);
"""),
('user_stat_analysis', """
DROP TABLE IF EXISTS user_stat_analysis CASCADE;
CREATE TABLE user_stat_analysis (
analysis_id SERIAL PRIMARY KEY,
user_id INTEGER REFERENCES user_profiles(user_id) ON DELETE CASCADE,
session_id VARCHAR(100),
page_views INTEGER DEFAULT 0,
time_spent_minutes INTEGER DEFAULT 0,
actions_performed JSONB,
device_info JSONB,
ip_address INET,
location_data JSONB,
referrer_url TEXT,
conversion_events JSONB,
analysis_date DATE DEFAULT CURRENT_DATE,
created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP
);
"""),
('product_catalog', """
DROP TABLE IF EXISTS product_catalog CASCADE;
CREATE TABLE product_catalog (
product_id SERIAL PRIMARY KEY,
product_name VARCHAR(100) NOT NULL,
description TEXT,
category VARCHAR(50),
price DECIMAL(10,2) NOT NULL,
cost DECIMAL(10,2),
sku VARCHAR(50) UNIQUE,
inventory_count INTEGER DEFAULT 0,
is_active BOOLEAN DEFAULT true,
created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
updated_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
supplier_info JSONB,
weight_kg DECIMAL(6,2),
dimensions JSONB
);
"""),
('order_management', """
DROP TABLE IF EXISTS order_management CASCADE;
CREATE TABLE order_management (
order_id SERIAL PRIMARY KEY,
user_id INTEGER REFERENCES user_profiles(user_id),
order_number VARCHAR(50) UNIQUE NOT NULL,
order_status VARCHAR(20) DEFAULT 'pending',
total_amount DECIMAL(12,2) NOT NULL,
tax_amount DECIMAL(12,2),
shipping_amount DECIMAL(12,2),
discount_amount DECIMAL(12,2) DEFAULT 0,
payment_method VARCHAR(50),
payment_status VARCHAR(20) DEFAULT 'pending',
shipping_address JSONB,
billing_address JSONB,
order_date TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
shipped_date TIMESTAMP,
delivered_date TIMESTAMP,
tracking_number VARCHAR(100)
);
"""),
('financial_transactions', """
DROP TABLE IF EXISTS financial_transactions CASCADE;
CREATE TABLE financial_transactions (
transaction_id SERIAL PRIMARY KEY,
order_id INTEGER REFERENCES order_management(order_id),
user_id INTEGER REFERENCES user_profiles(user_id),
transaction_type VARCHAR(20) NOT NULL,
amount DECIMAL(12,2) NOT NULL,
currency VARCHAR(3) DEFAULT 'USD',
payment_gateway VARCHAR(50),
gateway_transaction_id VARCHAR(100),
credit_card_last_four CHAR(4),
bank_account_last_four CHAR(4),
transaction_status VARCHAR(20) DEFAULT 'pending',
processed_at TIMESTAMP,
created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
fee_amount DECIMAL(8,2),
refund_amount DECIMAL(12,2) DEFAULT 0,
notes TEXT
);
"""),
('audit_logs', """
DROP TABLE IF EXISTS audit_logs CASCADE;
CREATE TABLE audit_logs (
log_id SERIAL PRIMARY KEY,
user_id INTEGER REFERENCES user_profiles(user_id),
action_type VARCHAR(50) NOT NULL,
table_name VARCHAR(50),
record_id INTEGER,
old_values JSONB,
new_values JSONB,
ip_address INET,
user_agent TEXT,
session_id VARCHAR(100),
timestamp TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
success BOOLEAN DEFAULT true,
error_message TEXT
);
""")
]
for table_name, sql in tables:
cur.execute(sql)
def create_users(cur):
"""Create PostgreSQL users from configuration"""
for username, config in USER_CONFIGS.items():
cur.execute(f"CREATE USER {username} WITH PASSWORD %s;", (config['password'],))
def grant_expected_permissions(cur):
"""Grant expected permissions to users based on their roles"""
for username, config in USER_CONFIGS.items():
if config['status'] == 'active':
role = config['role']
permissions = ROLE_EXPECTED_PERMISSIONS.get(role, [])
for table_name, privilege in permissions:
cur.execute(f"GRANT {privilege} ON {table_name} TO {username};")
def grant_excessive_permissions(cur):
"""Grant excessive permissions that should be flagged as security issues"""
for username, table_name, privilege in EXCESSIVE_PERMISSIONS:
cur.execute(f"GRANT {privilege} ON {table_name} TO {username};")
def revoke_permissions(cur):
"""Revoke specific permissions to create missing permission findings"""
for username, table_name, privilege in PERMISSIONS_TO_REVOKE:
cur.execute(f"REVOKE {privilege} ON {table_name} FROM {username};")
def grant_sequence_permissions(cur):
"""Grant sequence permissions for users that need them"""
users_needing_sequences = ['customer_service', 'product_manager']
for username in users_needing_sequences:
cur.execute(f"GRANT USAGE, SELECT ON ALL SEQUENCES IN SCHEMA public TO {username};")
def setup_security_environment():
"""
Set up a security-focused PostgreSQL environment with business tables and users with various permissions.
Creates a scenario where some users have dangling or insufficient permissions for realistic security analysis.
"""
# Database connection parameters from environment
db_params = {
'host': os.getenv('POSTGRES_HOST', 'localhost'),
'port': os.getenv('POSTGRES_PORT', '5432'),
'user': os.getenv('POSTGRES_USERNAME', 'postgres'),
'password': os.getenv('POSTGRES_PASSWORD', 'password'),
'database': os.getenv('POSTGRES_DATABASE', 'postgres')
}
postgres_params = db_params.copy()
postgres_params['database'] = 'postgres'
try:
conn_postgres = psycopg2.connect(**postgres_params)
conn_postgres.set_isolation_level(ISOLATION_LEVEL_AUTOCOMMIT)
cur_postgres = conn_postgres.cursor()
current_db = db_params['database']
cur_postgres.execute("SELECT datname FROM pg_database WHERE datname LIKE %s AND datname != %s;", ('%user_permission_audit%', current_db))
audit_databases = cur_postgres.fetchall()
if audit_databases:
for db_row in audit_databases:
db_name = db_row[0]
try:
cur_postgres.execute("SELECT pg_terminate_backend(pid) FROM pg_stat_activity WHERE datname = %s;", (db_name,))
cur_postgres.execute(f"DROP DATABASE IF EXISTS {db_name};")
print(f"Dropped database: {db_name}")
except Exception as e:
print(f"Warning: Could not drop database {db_name}: {e}")
# Clean up existing users
for username in USER_CONFIGS.keys():
try:
cur_postgres.execute("SELECT pg_terminate_backend(pid) FROM pg_stat_activity WHERE usename = %s;", (username,))
cur_postgres.execute(f"DROP USER IF EXISTS {username};")
except Exception as e:
print(f"Warning: Could not drop user {username}: {e}")
cur_postgres.close()
conn_postgres.close()
except Exception as e:
print(f"Warning: Could not clean up users: {e}")
try:
conn = psycopg2.connect(**db_params)
conn.set_isolation_level(ISOLATION_LEVEL_AUTOCOMMIT)
cur = conn.cursor()
print("Setting up security audit environment...")
# Create business tables with realistic structure
create_business_tables(cur)
print("Created 7 business tables")
# Create users
create_users(cur)
active_count = len([u for u in USER_CONFIGS.values() if u['status'] == 'active'])
inactive_count = len([u for u in USER_CONFIGS.values() if u['status'] == 'inactive'])
print(f"Created {len(USER_CONFIGS)} users ({active_count} functional, {inactive_count} dangling)")
# Grant expected permissions
grant_expected_permissions(cur)
# Grant excessive permissions that will be flagged as issues
grant_excessive_permissions(cur)
print("Granted initial permissions")
# Revoke specific permissions to create missing permission findings
revoke_permissions(cur)
# Grant sequence permissions where needed
grant_sequence_permissions(cur)
cur.close()
conn.close()
except Exception as e:
print(f"Error setting up environment: {e}")
sys.exit(1)
if __name__ == "__main__":
setup_security_environment()
================================================
FILE: tasks/postgres/standard/security/user_permission_audit/verify.py
================================================
import os
import psycopg2
import sys
def verify_security_audit():
"""
Verify that the security audit correctly identified all permission issues.
"""
# Database connection parameters from environment
db_params = {
'host': os.getenv('POSTGRES_HOST', 'localhost'),
'port': os.getenv('POSTGRES_PORT', '5432'),
'user': os.getenv('POSTGRES_USERNAME', 'postgres'),
'password': os.getenv('POSTGRES_PASSWORD', 'password'),
'database': os.getenv('POSTGRES_DATABASE', 'postgres')
}
try:
conn = psycopg2.connect(**db_params)
cur = conn.cursor()
print("| Verifying security audit findings...")
# Check if security_audit_results table exists
cur.execute("""
SELECT EXISTS (
SELECT FROM information_schema.tables
WHERE table_name = 'security_audit_results'
);
""")
if not cur.fetchone()[0]:
print("FAIL: security_audit_results table not found")
return False
# Check if security_audit_details table exists
cur.execute("""
SELECT EXISTS (
SELECT FROM information_schema.tables
WHERE table_name = 'security_audit_details'
);
""")
if not cur.fetchone()[0]:
print("FAIL: security_audit_details table not found")
return False
# Get all detailed findings
cur.execute("SELECT * FROM security_audit_details ORDER BY detail_id;")
findings = cur.fetchall()
if not findings:
print("FAIL: No findings in security_audit_details table")
return False
print(f"| Found {len(findings)} audit findings")
# Expected findings based on the ground truth:
expected_findings = {
# Expected dangling users
'dangling_users': {'temp_contractor', 'old_employee', 'test_account'},
# Expected missing permissions (should be granted)
'missing_permissions': {
('analytics_user', 'user_profiles', 'SELECT'),
('analytics_user', 'product_catalog', 'SELECT'),
('analytics_user', 'order_management', 'SELECT'),
('marketing_user', 'product_catalog', 'SELECT'),
('customer_service', 'product_catalog', 'SELECT'),
('finance_user', 'user_profiles', 'SELECT'),
('product_manager', 'user_stat_analysis', 'SELECT'),
('security_auditor', 'audit_logs', 'SELECT'),
('developer_user', 'product_catalog', 'SELECT'),
('backup_user', 'order_management', 'SELECT'),
('backup_user', 'financial_transactions', 'SELECT'),
('backup_user', 'user_stat_analysis', 'SELECT'),
('backup_user', 'user_credentials', 'SELECT')
},
# Expected excessive permissions (should be revoked)
'excessive_permissions': {
('analytics_user', 'financial_transactions', 'SELECT'),
('marketing_user', 'financial_transactions', 'SELECT'),
('customer_service', 'user_credentials', 'SELECT'),
('product_manager', 'financial_transactions', 'SELECT'),
('security_auditor', 'financial_transactions', 'UPDATE'),
('developer_user', 'user_credentials', 'SELECT'),
('developer_user', 'order_management', 'UPDATE'),
('backup_user', 'product_catalog', 'DELETE'),
('temp_contractor', 'product_catalog', 'SELECT'),
('temp_contractor', 'user_profiles', 'SELECT'),
('old_employee', 'audit_logs', 'SELECT'),
('old_employee', 'user_stat_analysis', 'UPDATE'),
('test_account', 'user_profiles', 'SELECT')
}
}
found_dangling = set()
found_missing_permissions = set()
found_excessive_permissions = set()
# Analyze findings (detail_id, username, issue_type, table_name, permission_type, expected_access)
for finding in findings:
username = finding[1]
issue_type = finding[2]
table_name = finding[3]
permission_type = finding[4]
expected_access = finding[5]
if issue_type == 'DANGLING_USER':
found_dangling.add(username)
elif issue_type == 'MISSING_PERMISSION' and expected_access:
if table_name and permission_type:
found_missing_permissions.add((username, table_name, permission_type))
elif issue_type == 'EXCESSIVE_PERMISSION' and not expected_access:
if table_name and permission_type:
found_excessive_permissions.add((username, table_name, permission_type))
# Verify dangling users
missing_dangling = expected_findings['dangling_users'] - found_dangling
extra_dangling = found_dangling - expected_findings['dangling_users']
# Verify missing permissions
missing_missing_perms = expected_findings['missing_permissions'] - found_missing_permissions
extra_missing_perms = found_missing_permissions - expected_findings['missing_permissions']
# Verify excessive permissions
missing_excessive_perms = expected_findings['excessive_permissions'] - found_excessive_permissions
extra_excessive_perms = found_excessive_permissions - expected_findings['excessive_permissions']
# Validate structure
structure_valid = True
for i, finding in enumerate(findings):
if len(finding) != 6: # Should have 6 columns
print(f"| FAIL: Finding {i + 1} has wrong number of columns (expected 6, got {len(finding)})")
structure_valid = False
continue
detail_id, username, issue_type, table_name, permission_type, expected_access = finding
if not username:
print(f"| FAIL: Finding {i + 1} missing username")
structure_valid = False
if issue_type not in ['DANGLING_USER', 'MISSING_PERMISSION', 'EXCESSIVE_PERMISSION']:
print(f"| FAIL: Finding {i + 1} invalid issue_type: {issue_type}")
structure_valid = False
if expected_access not in [True, False]:
print(f"| FAIL: Finding {i + 1} invalid expected_access: {expected_access}")
structure_valid = False
if structure_valid:
print(f"| ✓ structure is valid")
# Check for missing findings
all_correct = True
print(f"| Expected dangling users: {expected_findings['dangling_users']} Found: {found_dangling}")
if missing_dangling:
print(f"| Missing dangling users: {missing_dangling}")
all_correct = False
print(
f"| Expected missing permissions: {len(expected_findings['missing_permissions'])} Found: {len(found_missing_permissions)} Missing: {len(missing_missing_perms)}")
if missing_missing_perms:
print(f"| Missing 'missing permission' findings:")
for perm in sorted(missing_missing_perms):
print(f"| - {perm[0]} should be granted {perm[2]} on {perm[1]}")
all_correct = False
print(
f"| Expected excessive permissions: {len(expected_findings['excessive_permissions'])} Found: {len(found_excessive_permissions)} Missing: {len(missing_excessive_perms)}")
if missing_excessive_perms:
print(f"| Missing 'excessive permission' findings:")
for perm in sorted(missing_excessive_perms):
print(f"| - {perm[0]} should have {perm[2]} revoked on {perm[1]}")
all_correct = False
# Check audit summary table
cur.execute(
"SELECT audit_type, total_issues, users_affected, tables_affected FROM security_audit_results ORDER BY audit_type;")
summary_results = cur.fetchall()
# Expected summary numbers based on ground truth
expected_summary = {
'DANGLING_USERS': (3, 3, 0), # 3 issues, 3 users affected, 0 tables affected
'EXCESSIVE_PERMISSIONS': (13, 10, 7), # 13 issues, 10 users affected, 7 tables affected
'MISSING_PERMISSIONS': (13, 8, 7) # 13 issues, 8 users affected, 7 tables affected
}
summary_correct = True
for result in summary_results:
audit_type, total_issues, users_affected, tables_affected = result
print(f"| Summary result: [{audit_type}] {total_issues} issues, {users_affected} users affected, {tables_affected} tables affected")
if audit_type in expected_summary:
expected = expected_summary[audit_type]
if (total_issues, users_affected, tables_affected) != expected:
print(f"| FAIL: {audit_type} summary mismatch - Expected: {expected}, Got: ({total_issues}, {users_affected}, {tables_affected})")
summary_correct = False
else:
print(f"| ✓ {audit_type} summary matches expected values")
# Assert exact counts match expected
assert len(found_dangling) == 3, f"Expected 3 dangling users, found {len(found_dangling)}"
assert len(found_missing_permissions) == 13, f"Expected 13 missing permissions, found {len(found_missing_permissions)}"
assert len(found_excessive_permissions) == 13, f"Expected 13 excessive permissions, found {len(found_excessive_permissions)}"
if all_correct and structure_valid and summary_correct:
print("| ✓ All assertions passed")
return True
else:
return False
except Exception as e:
print(f"FAIL: Error during verification: {e}")
return False
finally:
if 'cur' in locals():
cur.close()
if 'conn' in locals():
conn.close()
if __name__ == "__main__":
success = verify_security_audit()
sys.exit(0 if success else 1)
================================================
FILE: tasks/postgres/standard/sports/baseball_player_analysis/description.md
================================================
Create a comprehensive baseball player performance analysis in the sports database.
## Background
You are a sports analyst working with a comprehensive sports database. The analytics team needs to create a detailed analysis of baseball players by combining their offensive and defensive statistics with personal information. Currently, this data is scattered across multiple tables and needs to be consolidated for reporting purposes.
## Your Task
Create a table called `baseball_player_analysis` that consolidates baseball player performance data. The table should provide a comprehensive view of each qualifying player's performance metrics.
### Table Structure
Create the `baseball_player_analysis` table with the following columns (an illustrative DDL sketch follows the list):
- `player_id` (INTEGER, NOT NULL) - Player identifier
- `player_name` (VARCHAR(255), NOT NULL) - Player's full name
- `team_name` (VARCHAR(255)) - Set to 'Unknown' for all players
- `games_played` (INTEGER) - Number of games/events the player participated in
- `at_bats` (INTEGER) - Total at-bats for the player
- `hits` (INTEGER) - Total hits for the player
- `runs_scored` (INTEGER) - Total runs scored by the player
- `rbi` (INTEGER) - Total runs batted in by the player
- `home_runs` (INTEGER) - Total home runs hit by the player
- `batting_average` (DECIMAL) - Calculated as hits/at_bats
- `defensive_games` (INTEGER) - Number of defensive games played (same as games_played)
- `putouts` (INTEGER) - Total putouts in defensive play
- `assists` (INTEGER) - Total assists in defensive play
- `errors` (INTEGER) - Total errors made in defensive play
- `fielding_percentage` (DECIMAL) - Calculated as (putouts + assists)/(putouts + assists + errors)
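A minimal DDL sketch of this structure (illustrative only; the task does not mandate a specific precision for the DECIMAL columns):

```sql
CREATE TABLE baseball_player_analysis (
    player_id           INTEGER NOT NULL,
    player_name         VARCHAR(255) NOT NULL,
    team_name           VARCHAR(255),
    games_played        INTEGER,
    at_bats             INTEGER,
    hits                INTEGER,
    runs_scored         INTEGER,
    rbi                 INTEGER,
    home_runs           INTEGER,
    batting_average     DECIMAL,
    defensive_games     INTEGER,
    putouts             INTEGER,
    assists             INTEGER,
    errors              INTEGER,
    fielding_percentage DECIMAL
);
```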
### Data Requirements
Include only baseball players that meet ALL of the following criteria:
- Have offensive statistics available for regular season play
- Have played at least 10 games/events
- Have at least 50 at-bats
- Have a valid name available in the system
### Important Notes
- Focus on regular season statistics only
- Handle NULL values appropriately in calculations (use 0 for missing stats)
- Ensure batting average and fielding percentage calculations handle division by zero (see the sketch after these notes)
- Do NOT use ROUND functions - keep the full precision of calculated values
- Sort results by batting average descending, then by games played descending
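The calculation and filtering rules above can be handled with `COALESCE` (missing stats become 0), `NULLIF` (a zero denominator yields NULL instead of an error), and a cast to a numeric type (so integer division does not truncate the ratios). The snippet below is only a sketch of that pattern: `offense` and `defense` are hypothetical placeholders, not real tables in this database, so explore the actual schema and joins first.

```sql
-- Pattern sketch only: "offense"/"defense" are placeholders for the real source tables.
SELECT
    o.player_id,
    COALESCE(o.hits, 0)::numeric
        / NULLIF(COALESCE(o.at_bats, 0), 0)                         AS batting_average,
    (COALESCE(d.putouts, 0) + COALESCE(d.assists, 0))::numeric
        / NULLIF(COALESCE(d.putouts, 0) + COALESCE(d.assists, 0)
                 + COALESCE(d.errors, 0), 0)                        AS fielding_percentage
FROM offense o
JOIN defense d ON d.player_id = o.player_id
WHERE COALESCE(o.games_played, 0) >= 10   -- at least 10 games/events
  AND COALESCE(o.at_bats, 0) >= 50;       -- at least 50 at-bats
```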
## Requirements
- Explore the database to understand the table structure and relationships
- Create the table with the exact structure specified above
- Populate the table using appropriate queries and joins
- Ensure all calculations are mathematically correct
- Handle edge cases properly (division by zero, NULL values)
================================================
FILE: tasks/postgres/standard/sports/baseball_player_analysis/meta.json
================================================
{
"task_id": "baseball_player_analysis",
"task_name": "Baseball Player Analysis",
"category_id": "sports",
"category_name": "Sports",
"description": "Consolidate scattered baseball player data into comprehensive analysis table combining offensive and defensive statistics.",
"author": "Lingxiao Du",
"created_at": "2025-08-18",
"difficulty": "L3",
"tags": [
"reporting and analytics",
"statistical aggregation",
"schema design"
],
"mcp": [
"postgres"
],
"meta_data": {
"stateType": "text",
"stateContent": "Table \"addresses\" {\n \"id\" int4 [not null, increment]\n \"location_id\" int4 [not null]\n \"language\" varchar(100)\n \"suite\" varchar(100)\n \"floor\" varchar(100)\n \"building\" varchar(100)\n \"street_number\" varchar(100)\n \"street_prefix\" varchar(100)\n \"street\" varchar(100)\n \"street_suffix\" varchar(100)\n \"neighborhood\" varchar(100)\n \"district\" varchar(100)\n \"locality\" varchar(100)\n \"county\" varchar(100)\n \"region\" varchar(100)\n \"postal_code\" varchar(100)\n \"country\" varchar(100)\n}\n\nTable \"affiliation_phases\" {\n \"id\" int4 [not null, increment]\n \"affiliation_id\" int4 [not null]\n \"ancestor_affiliation_id\" int4\n \"start_season_id\" int4\n \"start_date_time\" timestamp\n \"end_season_id\" int4\n \"end_date_time\" timestamp\n}\n\nTable \"affiliations\" {\n \"id\" int4 [not null, increment]\n \"affiliation_key\" varchar(100) [not null]\n \"affiliation_type\" varchar(100)\n \"publisher_id\" int4 [not null]\n}\n\nTable \"affiliations_documents\" {\n \"affiliation_id\" int4 [not null]\n \"document_id\" int4 [not null]\n}\n\nTable \"affiliations_events\" {\n \"affiliation_id\" int4 [not null]\n \"event_id\" int4 [not null]\n}\n\nTable \"affiliations_media\" {\n \"affiliation_id\" int4 [not null]\n \"media_id\" int4 [not null]\n}\n\nTable \"american_football_action_participants\" {\n \"id\" int4 [not null, increment]\n \"american_football_action_play_id\" int4 [not null]\n \"person_id\" int4 [not null]\n \"participant_role\" varchar(100) [not null]\n \"score_type\" varchar(100)\n \"field_line\" int4\n \"yardage\" int4\n \"score_credit\" int4\n \"yards_gained\" int4\n}\n\nTable \"american_football_action_plays\" {\n \"id\" int4 [not null, increment]\n \"american_football_event_state_id\" int4 [not null]\n \"play_type\" varchar(100)\n \"score_attempt_type\" varchar(100)\n \"drive_result\" varchar(100)\n \"points\" int4\n \"comment\" varchar(255)\n}\n\nTable \"american_football_defensive_stats\" {\n \"id\" int4 [not null, increment]\n \"tackles_total\" varchar(100)\n \"tackles_solo\" varchar(100)\n \"tackles_assists\" varchar(100)\n \"interceptions_total\" varchar(100)\n \"interceptions_yards\" varchar(100)\n \"interceptions_average\" varchar(100)\n \"interceptions_longest\" varchar(100)\n \"interceptions_touchdown\" varchar(100)\n \"quarterback_hurries\" varchar(100)\n \"sacks_total\" varchar(100)\n \"sacks_yards\" varchar(100)\n \"passes_defensed\" varchar(100)\n}\n\nTable \"american_football_down_progress_stats\" {\n \"id\" int4 [not null, increment]\n \"first_downs_total\" varchar(100)\n \"first_downs_pass\" varchar(100)\n \"first_downs_run\" varchar(100)\n \"first_downs_penalty\" varchar(100)\n \"conversions_third_down\" varchar(100)\n \"conversions_third_down_attempts\" varchar(100)\n \"conversions_third_down_percentage\" varchar(100)\n \"conversions_fourth_down\" varchar(100)\n \"conversions_fourth_down_attempts\" varchar(100)\n \"conversions_fourth_down_percentage\" varchar(100)\n}\n\nTable \"american_football_event_states\" {\n \"id\" int4 [not null, increment]\n \"event_id\" int4 [not null]\n \"current_state\" int2\n \"sequence_number\" int4\n \"period_value\" int4\n \"period_time_elapsed\" varchar(100)\n \"period_time_remaining\" varchar(100)\n \"clock_state\" varchar(100)\n \"down\" int4\n \"team_in_possession_id\" int4\n \"distance_for_1st_down\" int4\n \"field_side\" varchar(100)\n \"field_line\" int4\n \"context\" varchar(40)\n}\n\nTable \"american_football_fumbles_stats\" {\n \"id\" int4 [not null, increment]\n 
\"fumbles_committed\" varchar(100)\n \"fumbles_forced\" varchar(100)\n \"fumbles_recovered\" varchar(100)\n \"fumbles_lost\" varchar(100)\n \"fumbles_yards_gained\" varchar(100)\n \"fumbles_own_committed\" varchar(100)\n \"fumbles_own_recovered\" varchar(100)\n \"fumbles_own_lost\" varchar(100)\n \"fumbles_own_yards_gained\" varchar(100)\n \"fumbles_opposing_committed\" varchar(100)\n \"fumbles_opposing_recovered\" varchar(100)\n \"fumbles_opposing_lost\" varchar(100)\n \"fumbles_opposing_yards_gained\" varchar(100)\n}\n\nTable \"american_football_offensive_stats\" {\n \"id\" int4 [not null, increment]\n \"offensive_plays_yards\" varchar(100)\n \"offensive_plays_number\" varchar(100)\n \"offensive_plays_average_yards_per\" varchar(100)\n \"possession_duration\" varchar(100)\n \"turnovers_giveaway\" varchar(100)\n}\n\nTable \"american_football_passing_stats\" {\n \"id\" int4 [not null, increment]\n \"passes_attempts\" varchar(100)\n \"passes_completions\" varchar(100)\n \"passes_percentage\" varchar(100)\n \"passes_yards_gross\" varchar(100)\n \"passes_yards_net\" varchar(100)\n \"passes_yards_lost\" varchar(100)\n \"passes_touchdowns\" varchar(100)\n \"passes_touchdowns_percentage\" varchar(100)\n \"passes_interceptions\" varchar(100)\n \"passes_interceptions_percentage\" varchar(100)\n \"passes_longest\" varchar(100)\n \"passes_average_yards_per\" varchar(100)\n \"passer_rating\" varchar(100)\n \"receptions_total\" varchar(100)\n \"receptions_yards\" varchar(100)\n \"receptions_touchdowns\" varchar(100)\n \"receptions_first_down\" varchar(100)\n \"receptions_longest\" varchar(100)\n \"receptions_average_yards_per\" varchar(100)\n}\n\nTable \"american_football_penalties_stats\" {\n \"id\" int4 [not null, increment]\n \"penalties_total\" varchar(100)\n \"penalty_yards\" varchar(100)\n \"penalty_first_downs\" varchar(100)\n}\n\nTable \"american_football_rushing_stats\" {\n \"id\" int4 [not null, increment]\n \"rushes_attempts\" varchar(100)\n \"rushes_yards\" varchar(100)\n \"rushes_touchdowns\" varchar(100)\n \"rushing_average_yards_per\" varchar(100)\n \"rushes_first_down\" varchar(100)\n \"rushes_longest\" varchar(100)\n}\n\nTable \"american_football_sacks_against_stats\" {\n \"id\" int4 [not null, increment]\n \"sacks_against_yards\" varchar(100)\n \"sacks_against_total\" varchar(100)\n}\n\nTable \"american_football_scoring_stats\" {\n \"id\" int4 [not null, increment]\n \"touchdowns_total\" varchar(100)\n \"touchdowns_passing\" varchar(100)\n \"touchdowns_rushing\" varchar(100)\n \"touchdowns_special_teams\" varchar(100)\n \"touchdowns_defensive\" varchar(100)\n \"extra_points_attempts\" varchar(100)\n \"extra_points_made\" varchar(100)\n \"extra_points_missed\" varchar(100)\n \"extra_points_blocked\" varchar(100)\n \"field_goal_attempts\" varchar(100)\n \"field_goals_made\" varchar(100)\n \"field_goals_missed\" varchar(100)\n \"field_goals_blocked\" varchar(100)\n \"safeties_against\" varchar(100)\n \"two_point_conversions_attempts\" varchar(100)\n \"two_point_conversions_made\" varchar(100)\n \"touchbacks_total\" varchar(100)\n}\n\nTable \"american_football_special_teams_stats\" {\n \"id\" int4 [not null, increment]\n \"returns_punt_total\" varchar(100)\n \"returns_punt_yards\" varchar(100)\n \"returns_punt_average\" varchar(100)\n \"returns_punt_longest\" varchar(100)\n \"returns_punt_touchdown\" varchar(100)\n \"returns_kickoff_total\" varchar(100)\n \"returns_kickoff_yards\" varchar(100)\n \"returns_kickoff_average\" varchar(100)\n \"returns_kickoff_longest\" varchar(100)\n 
\"returns_kickoff_touchdown\" varchar(100)\n \"returns_total\" varchar(100)\n \"returns_yards\" varchar(100)\n \"punts_total\" varchar(100)\n \"punts_yards_gross\" varchar(100)\n \"punts_yards_net\" varchar(100)\n \"punts_longest\" varchar(100)\n \"punts_inside_20\" varchar(100)\n \"punts_inside_20_percentage\" varchar(100)\n \"punts_average\" varchar(100)\n \"punts_blocked\" varchar(100)\n \"touchbacks_total\" varchar(100)\n \"touchbacks_total_percentage\" varchar(100)\n \"touchbacks_kickoffs\" varchar(100)\n \"touchbacks_kickoffs_percentage\" varchar(100)\n \"touchbacks_punts\" varchar(100)\n \"touchbacks_punts_percentage\" varchar(100)\n \"touchbacks_interceptions\" varchar(100)\n \"touchbacks_interceptions_percentage\" varchar(100)\n \"fair_catches\" varchar(100)\n}\n\nTable \"baseball_action_contact_details\" {\n \"id\" int4 [not null, increment]\n \"baseball_action_pitch_id\" int4 [not null]\n \"location\" varchar(100)\n \"strength\" varchar(100)\n \"velocity\" int4\n \"comment\" text\n \"trajectory_coordinates\" varchar(100)\n \"trajectory_formula\" varchar(100)\n}\n\nTable \"baseball_action_pitches\" {\n \"id\" int4 [not null, increment]\n \"baseball_action_play_id\" int4 [not null]\n \"sequence_number\" int4\n \"baseball_defensive_group_id\" int4\n \"umpire_call\" varchar(100)\n \"pitch_location\" varchar(100)\n \"pitch_type\" varchar(100)\n \"pitch_velocity\" int4\n \"comment\" text\n \"trajectory_coordinates\" varchar(100)\n \"trajectory_formula\" varchar(100)\n \"ball_type\" varchar(40)\n \"strike_type\" varchar(40)\n}\n\nTable \"baseball_action_plays\" {\n \"id\" int4 [not null, increment]\n \"baseball_event_state_id\" int4 [not null]\n \"play_type\" varchar(100)\n \"notation\" varchar(100)\n \"notation_yaml\" text\n \"baseball_defensive_group_id\" int4\n \"comment\" varchar(255)\n \"runner_on_first_advance\" int4\n \"runner_on_second_advance\" int4\n \"runner_on_third_advance\" int4\n \"outs_recorded\" int4\n \"rbi\" int4\n \"runs_scored\" int4\n \"earned_runs_scored\" varchar(100)\n}\n\nTable \"baseball_action_substitutions\" {\n \"id\" int4 [not null, increment]\n \"baseball_event_state_id\" int4 [not null]\n \"sequence_number\" int4\n \"person_type\" varchar(100)\n \"person_original_id\" int4\n \"person_original_position_id\" int4\n \"person_original_lineup_slot\" int4\n \"person_replacing_id\" int4\n \"person_replacing_position_id\" int4\n \"person_replacing_lineup_slot\" int4\n \"substitution_reason\" varchar(100)\n \"comment\" varchar(100)\n}\n\nTable \"baseball_defensive_group\" {\n \"id\" int4 [not null, increment]\n}\n\nTable \"baseball_defensive_players\" {\n \"id\" int4 [not null, increment]\n \"baseball_defensive_group_id\" int4 [not null]\n \"player_id\" int4 [not null]\n \"position_id\" int4 [not null]\n}\n\nTable \"baseball_defensive_stats\" {\n \"id\" int4 [not null, increment]\n \"double_plays\" int4\n \"triple_plays\" int4\n \"putouts\" int4\n \"assists\" int4\n \"errors\" int4\n \"fielding_percentage\" numeric\n \"defensive_average\" numeric\n \"errors_passed_ball\" int4\n \"errors_catchers_interference\" int4\n}\n\nTable \"baseball_event_states\" {\n \"id\" int4 [not null, increment]\n \"event_id\" int4 [not null]\n \"current_state\" int2\n \"sequence_number\" int4\n \"at_bat_number\" int4\n \"inning_value\" int4\n \"inning_half\" varchar(100)\n \"outs\" int4\n \"balls\" int4\n \"strikes\" int4\n \"runner_on_first_id\" int4\n \"runner_on_second_id\" int4\n \"runner_on_third_id\" int4\n \"runner_on_first\" int2\n \"runner_on_second\" int2\n 
\"runner_on_third\" int2\n \"runs_this_inning_half\" int4\n \"pitcher_id\" int4\n \"batter_id\" int4\n \"batter_side\" varchar(100)\n \"context\" varchar(40)\n}\n\nTable \"baseball_offensive_stats\" {\n \"id\" int4 [not null, increment]\n \"average\" numeric\n \"runs_scored\" int4\n \"at_bats\" int4\n \"hits\" int4\n \"rbi\" int4\n \"total_bases\" int4\n \"slugging_percentage\" numeric\n \"bases_on_balls\" int4\n \"strikeouts\" int4\n \"left_on_base\" int4\n \"left_in_scoring_position\" int4\n \"singles\" int4\n \"doubles\" int4\n \"triples\" int4\n \"home_runs\" int4\n \"grand_slams\" int4\n \"at_bats_per_rbi\" numeric\n \"plate_appearances_per_rbi\" numeric\n \"at_bats_per_home_run\" numeric\n \"plate_appearances_per_home_run\" numeric\n \"sac_flies\" int4\n \"sac_bunts\" int4\n \"grounded_into_double_play\" int4\n \"moved_up\" int4\n \"on_base_percentage\" numeric\n \"stolen_bases\" int4\n \"stolen_bases_caught\" int4\n \"stolen_bases_average\" numeric\n \"hit_by_pitch\" int4\n \"defensive_interferance_reaches\" int4\n \"on_base_plus_slugging\" numeric\n \"plate_appearances\" int4\n \"hits_extra_base\" int4\n}\n\nTable \"baseball_pitching_stats\" {\n \"id\" int4 [not null, increment]\n \"runs_allowed\" int4\n \"singles_allowed\" int4\n \"doubles_allowed\" int4\n \"triples_allowed\" int4\n \"home_runs_allowed\" int4\n \"innings_pitched\" varchar(20)\n \"hits\" int4\n \"earned_runs\" int4\n \"unearned_runs\" int4\n \"bases_on_balls\" int4\n \"bases_on_balls_intentional\" int4\n \"strikeouts\" int4\n \"strikeout_to_bb_ratio\" numeric\n \"number_of_pitches\" int4\n \"era\" numeric\n \"inherited_runners_scored\" int4\n \"pick_offs\" int4\n \"errors_hit_with_pitch\" int4\n \"errors_wild_pitch\" int4\n \"balks\" int4\n \"wins\" int4\n \"losses\" int4\n \"saves\" int4\n \"shutouts\" int4\n \"games_complete\" int4\n \"games_finished\" int4\n \"winning_percentage\" numeric\n \"event_credit\" varchar(40)\n \"save_credit\" varchar(40)\n}\n\nTable \"basketball_defensive_stats\" {\n \"id\" int4 [not null, increment]\n \"steals_total\" varchar(100)\n \"steals_per_game\" varchar(100)\n \"blocks_total\" varchar(100)\n \"blocks_per_game\" varchar(100)\n}\n\nTable \"basketball_event_states\" {\n \"id\" int4 [not null, increment]\n \"event_id\" int4 [not null]\n \"current_state\" int4\n \"sequence_number\" int4\n \"period_value\" varchar(100)\n \"period_time_elapsed\" varchar(100)\n \"period_time_remaining\" varchar(100)\n \"context\" varchar(40)\n}\n\nTable \"basketball_offensive_stats\" {\n \"id\" int4 [not null, increment]\n \"field_goals_made\" int4\n \"field_goals_attempted\" int4\n \"field_goals_percentage\" varchar(100)\n \"field_goals_per_game\" varchar(100)\n \"field_goals_attempted_per_game\" varchar(100)\n \"field_goals_percentage_adjusted\" varchar(100)\n \"three_pointers_made\" int4\n \"three_pointers_attempted\" int4\n \"three_pointers_percentage\" varchar(100)\n \"three_pointers_per_game\" varchar(100)\n \"three_pointers_attempted_per_game\" varchar(100)\n \"free_throws_made\" varchar(100)\n \"free_throws_attempted\" varchar(100)\n \"free_throws_percentage\" varchar(100)\n \"free_throws_per_game\" varchar(100)\n \"free_throws_attempted_per_game\" varchar(100)\n \"points_scored_total\" varchar(100)\n \"points_scored_per_game\" varchar(100)\n \"assists_total\" varchar(100)\n \"assists_per_game\" varchar(100)\n \"turnovers_total\" varchar(100)\n \"turnovers_per_game\" varchar(100)\n \"points_scored_off_turnovers\" varchar(100)\n \"points_scored_in_paint\" varchar(100)\n 
\"points_scored_on_second_chance\" varchar(100)\n \"points_scored_on_fast_break\" varchar(100)\n}\n\nTable \"basketball_rebounding_stats\" {\n \"id\" int4 [not null, increment]\n \"rebounds_total\" varchar(100)\n \"rebounds_per_game\" varchar(100)\n \"rebounds_defensive\" varchar(100)\n \"rebounds_offensive\" varchar(100)\n \"team_rebounds_total\" varchar(100)\n \"team_rebounds_per_game\" varchar(100)\n \"team_rebounds_defensive\" varchar(100)\n \"team_rebounds_offensive\" varchar(100)\n}\n\nTable \"basketball_team_stats\" {\n \"id\" int4 [not null, increment]\n \"timeouts_left\" varchar(100)\n \"largest_lead\" varchar(100)\n \"fouls_total\" varchar(100)\n \"turnover_margin\" varchar(100)\n}\n\nTable \"bookmakers\" {\n \"id\" int4 [not null, increment]\n \"bookmaker_key\" varchar(100)\n \"publisher_id\" int4 [not null]\n \"location_id\" int4\n}\n\nTable \"core_person_stats\" {\n \"id\" int4 [not null, increment]\n \"time_played_event\" varchar(40)\n \"time_played_total\" varchar(40)\n \"time_played_event_average\" varchar(40)\n \"events_played\" int4\n \"events_started\" int4\n \"position_id\" int4\n}\n\nTable \"core_stats\" {\n \"id\" int4 [not null, increment]\n \"score\" varchar(100)\n \"score_opposing\" varchar(100)\n \"score_attempts\" varchar(100)\n \"score_attempts_opposing\" varchar(100)\n \"score_percentage\" varchar(100)\n \"score_percentage_opposing\" varchar(100)\n}\n\nTable \"db_info\" {\n \"version\" varchar(100) [not null, default: 16]\n}\n\nTable \"display_names\" {\n \"id\" int4 [not null, increment]\n \"language\" varchar(100) [not null]\n \"entity_type\" varchar(100) [not null]\n \"entity_id\" int4 [not null]\n \"full_name\" varchar(100)\n \"first_name\" varchar(100)\n \"middle_name\" varchar(100)\n \"last_name\" varchar(100)\n \"alias\" varchar(100)\n \"abbreviation\" varchar(100)\n \"short_name\" varchar(100)\n \"prefix\" varchar(20)\n \"suffix\" varchar(20)\n}\n\nTable \"document_classes\" {\n \"id\" int4 [not null, increment]\n \"name\" varchar(100)\n}\n\nTable \"document_contents\" {\n \"id\" int4 [not null, increment]\n \"document_id\" int4 [not null]\n \"sportsml\" varchar(200)\n \"abstract\" text\n}\n\nTable \"document_fixtures\" {\n \"id\" int4 [not null, increment]\n \"fixture_key\" varchar(100)\n \"publisher_id\" int4 [not null]\n \"name\" varchar(100)\n \"document_class_id\" int4 [not null]\n}\n\nTable \"document_fixtures_events\" {\n \"id\" int4 [not null, increment]\n \"document_fixture_id\" int4 [not null]\n \"event_id\" int4 [not null]\n \"latest_document_id\" int4 [not null]\n \"last_update\" timestamp\n}\n\nTable \"document_package_entry\" {\n \"id\" int4 [not null, increment]\n \"document_package_id\" int4 [not null]\n \"rank\" varchar(100)\n \"document_id\" int4 [not null]\n \"headline\" varchar(100)\n \"short_headline\" varchar(100)\n}\n\nTable \"document_packages\" {\n \"id\" int4 [not null, increment]\n \"package_key\" varchar(100)\n \"package_name\" varchar(100)\n \"date_time\" date\n}\n\nTable \"documents\" {\n \"id\" int4 [not null, increment]\n \"doc_id\" varchar(75) [not null]\n \"publisher_id\" int4 [not null]\n \"date_time\" timestamp\n \"title\" varchar(255)\n \"language\" varchar(100)\n \"priority\" varchar(100)\n \"revision_id\" varchar(75)\n \"stats_coverage\" varchar(100)\n \"document_fixture_id\" int4 [not null]\n \"source_id\" int4\n \"db_loading_date_time\" timestamp\n}\n\nTable \"documents_media\" {\n \"id\" int4 [not null, increment]\n \"document_id\" int4 [not null]\n \"media_id\" int4 [not null]\n \"media_caption_id\" int4 [not 
null]\n}\n\nTable \"events\" {\n \"id\" int4 [not null, increment]\n \"event_key\" varchar(100) [not null]\n \"publisher_id\" int4 [not null]\n \"start_date_time\" timestamp\n \"site_id\" int4\n \"site_alignment\" varchar(100)\n \"event_status\" varchar(100)\n \"duration\" varchar(100)\n \"attendance\" varchar(100)\n \"last_update\" timestamp\n}\n\nTable \"events_documents\" {\n \"event_id\" int4 [not null]\n \"document_id\" int4 [not null]\n}\n\nTable \"events_media\" {\n \"event_id\" int4 [not null]\n \"media_id\" int4 [not null]\n}\n\nTable \"events_sub_seasons\" {\n \"event_id\" int4 [not null]\n \"sub_season_id\" int4 [not null]\n}\n\nTable \"ice_hockey_action_participants\" {\n \"id\" int4 [not null, increment]\n \"ice_hockey_action_play_id\" int4 [not null]\n \"person_id\" int4 [not null]\n \"participant_role\" varchar(100) [not null]\n \"point_credit\" int4\n}\n\nTable \"ice_hockey_action_plays\" {\n \"id\" int4 [not null, increment]\n \"ice_hockey_event_state_id\" int4 [not null]\n \"play_type\" varchar(100)\n \"score_attempt_type\" varchar(100)\n \"play_result\" varchar(100)\n \"comment\" varchar(255)\n}\n\nTable \"ice_hockey_defensive_stats\" {\n \"id\" int4 [not null, increment]\n \"shots_power_play_allowed\" varchar(100)\n \"shots_penalty_shot_allowed\" varchar(100)\n \"goals_power_play_allowed\" varchar(100)\n \"goals_penalty_shot_allowed\" varchar(100)\n \"goals_against_average\" varchar(100)\n \"saves\" varchar(100)\n \"save_percentage\" varchar(100)\n \"penalty_killing_amount\" varchar(100)\n \"penalty_killing_percentage\" varchar(100)\n \"shots_blocked\" varchar(100)\n \"takeaways\" varchar(100)\n \"shutouts\" varchar(100)\n \"minutes_penalty_killing\" varchar(100)\n \"hits\" varchar(100)\n \"goals_empty_net_allowed\" varchar(100)\n \"goals_short_handed_allowed\" varchar(100)\n \"goals_shootout_allowed\" varchar(100)\n \"shots_shootout_allowed\" varchar(100)\n}\n\nTable \"ice_hockey_event_states\" {\n \"id\" int4 [not null, increment]\n \"event_id\" int4 [not null]\n \"current_state\" int4\n \"sequence_number\" int4\n \"period_value\" varchar(100)\n \"period_time_elapsed\" varchar(100)\n \"period_time_remaining\" varchar(100)\n \"context\" varchar(40)\n}\n\nTable \"ice_hockey_offensive_stats\" {\n \"id\" int4 [not null, increment]\n \"goals_game_winning\" varchar(100)\n \"goals_game_tying\" varchar(100)\n \"goals_power_play\" varchar(100)\n \"goals_short_handed\" varchar(100)\n \"goals_even_strength\" varchar(100)\n \"goals_empty_net\" varchar(100)\n \"goals_overtime\" varchar(100)\n \"goals_shootout\" varchar(100)\n \"goals_penalty_shot\" varchar(100)\n \"assists\" varchar(100)\n \"points\" varchar(100)\n \"power_play_amount\" varchar(100)\n \"power_play_percentage\" varchar(100)\n \"shots_penalty_shot_taken\" varchar(100)\n \"shots_penalty_shot_missed\" varchar(100)\n \"shots_penalty_shot_percentage\" varchar(100)\n \"giveaways\" varchar(100)\n \"minutes_power_play\" varchar(100)\n \"faceoff_wins\" varchar(100)\n \"faceoff_losses\" varchar(100)\n \"faceoff_win_percentage\" varchar(100)\n \"scoring_chances\" varchar(100)\n}\n\nTable \"ice_hockey_player_stats\" {\n \"id\" int4 [not null, increment]\n \"plus_minus\" varchar(100)\n}\n\nTable \"injury_phases\" {\n \"id\" int4 [not null, increment]\n \"person_id\" int4 [not null]\n \"injury_status\" varchar(100)\n \"injury_type\" varchar(100)\n \"injury_comment\" varchar(100)\n \"disabled_list\" varchar(100)\n \"start_date_time\" timestamp\n \"end_date_time\" timestamp\n \"season_id\" int4\n \"phase_type\" varchar(100)\n 
\"injury_side\" varchar(100)\n}\n\nTable \"key_aliases\" {\n \"id\" int4 [not null, increment]\n \"key_id\" int4 [not null]\n \"key_root_id\" int4 [not null]\n}\n\nTable \"key_roots\" {\n \"id\" int4 [not null, increment]\n \"key_type\" varchar(100)\n}\n\nTable \"latest_revisions\" {\n \"id\" int4 [not null, increment]\n \"revision_id\" varchar(75) [not null]\n \"latest_document_id\" int4 [not null]\n}\n\nTable \"locations\" {\n \"id\" int4 [not null, increment]\n \"timezone\" varchar(100)\n \"latitude\" varchar(100)\n \"longitude\" varchar(100)\n \"country_code\" varchar(100)\n}\n\nTable \"media\" {\n \"id\" int4 [not null, increment]\n \"object_id\" int4\n \"source_id\" int4\n \"revision_id\" int4\n \"media_type\" varchar(100)\n \"publisher_id\" int4 [not null]\n \"date_time\" varchar(100)\n \"credit_id\" int4 [not null]\n \"db_loading_date_time\" timestamp\n \"creation_location_id\" int4 [not null]\n}\n\nTable \"media_captions\" {\n \"id\" int4 [not null, increment]\n \"media_id\" int4 [not null]\n \"caption_type\" varchar(100)\n \"caption\" varchar(100)\n \"caption_author_id\" int4 [not null]\n \"language\" varchar(100)\n \"caption_size\" varchar(100)\n}\n\nTable \"media_contents\" {\n \"id\" int4 [not null, increment]\n \"media_id\" int4 [not null]\n \"object\" varchar(100)\n \"format\" varchar(100)\n \"mime_type\" varchar(100)\n \"height\" varchar(100)\n \"width\" varchar(100)\n \"duration\" varchar(100)\n \"file_size\" varchar(100)\n \"resolution\" varchar(100)\n}\n\nTable \"media_keywords\" {\n \"id\" int4 [not null, increment]\n \"keyword\" varchar(100)\n \"media_id\" int4 [not null]\n}\n\nTable \"motor_racing_event_states\" {\n \"id\" int4 [not null, increment]\n \"event_id\" int4 [not null]\n \"current_state\" int4\n \"sequence_number\" int4\n \"lap\" varchar(100)\n \"laps_remaining\" varchar(100)\n \"time_elapsed\" varchar(100)\n \"flag_state\" varchar(100)\n \"context\" varchar(40)\n}\n\nTable \"motor_racing_qualifying_stats\" {\n \"id\" int4 [not null, increment]\n \"grid\" varchar(100)\n \"pole_position\" varchar(100)\n \"pole_wins\" varchar(100)\n \"qualifying_speed\" varchar(100)\n \"qualifying_speed_units\" varchar(100)\n \"qualifying_time\" varchar(100)\n \"qualifying_position\" varchar(100)\n}\n\nTable \"motor_racing_race_stats\" {\n \"id\" int4 [not null, increment]\n \"time_behind_leader\" varchar(100)\n \"laps_behind_leader\" varchar(100)\n \"time_ahead_follower\" varchar(100)\n \"laps_ahead_follower\" varchar(100)\n \"time\" varchar(100)\n \"points\" varchar(100)\n \"points_rookie\" varchar(100)\n \"bonus\" varchar(100)\n \"laps_completed\" varchar(100)\n \"laps_leading_total\" varchar(100)\n \"distance_leading\" varchar(100)\n \"distance_completed\" varchar(100)\n \"distance_units\" varchar(40)\n \"speed_average\" varchar(40)\n \"speed_units\" varchar(40)\n \"status\" varchar(40)\n \"finishes_top_5\" varchar(40)\n \"finishes_top_10\" varchar(40)\n \"starts\" varchar(40)\n \"finishes\" varchar(40)\n \"non_finishes\" varchar(40)\n \"wins\" varchar(40)\n \"races_leading\" varchar(40)\n \"money\" varchar(40)\n \"money_units\" varchar(40)\n \"leads_total\" varchar(40)\n}\n\nTable \"outcome_totals\" {\n \"id\" int4 [not null, increment]\n \"standing_subgroup_id\" int4 [not null]\n \"outcome_holder_type\" varchar(100)\n \"outcome_holder_id\" int4\n \"rank\" varchar(100)\n \"wins\" varchar(100)\n \"losses\" varchar(100)\n \"ties\" varchar(100)\n \"undecideds\" varchar(100)\n \"winning_percentage\" varchar(100)\n \"points_scored_for\" varchar(100)\n 
\"points_scored_against\" varchar(100)\n \"points_difference\" varchar(100)\n \"standing_points\" varchar(100)\n \"streak_type\" varchar(100)\n \"streak_duration\" varchar(100)\n \"streak_total\" varchar(100)\n \"streak_start\" date\n \"streak_end\" date\n}\n\nTable \"participants_events\" {\n \"id\" int4 [not null, increment]\n \"participant_type\" varchar(100) [not null]\n \"participant_id\" int4 [not null]\n \"event_id\" int4 [not null]\n \"alignment\" varchar(100)\n \"score\" varchar(100)\n \"event_outcome\" varchar(100)\n \"rank\" int4\n}\n\nTable \"periods\" {\n \"id\" int4 [not null, increment]\n \"participant_event_id\" int4 [not null]\n \"period_value\" varchar(100)\n \"score\" varchar(100)\n}\n\nTable \"person_event_metadata\" {\n \"id\" int4 [not null, increment]\n \"person_id\" int4 [not null]\n \"event_id\" int4 [not null]\n \"status\" varchar(100)\n \"health\" varchar(100)\n \"weight\" varchar(100)\n \"role_id\" int4\n \"position_id\" int4\n \"team_id\" int4\n \"lineup_slot\" int4\n \"lineup_slot_sequence\" int4\n}\n\nTable \"person_phases\" {\n \"id\" int4 [not null, increment]\n \"person_id\" int4 [not null]\n \"membership_type\" varchar(40) [not null]\n \"membership_id\" int4 [not null]\n \"role_id\" int4\n \"role_status\" varchar(40)\n \"phase_status\" varchar(40)\n \"uniform_number\" varchar(20)\n \"regular_position_id\" int4\n \"regular_position_depth\" varchar(40)\n \"height\" varchar(100)\n \"weight\" varchar(100)\n \"start_date_time\" timestamp\n \"start_season_id\" int4\n \"end_date_time\" timestamp\n \"end_season_id\" int4\n \"entry_reason\" varchar(40)\n \"exit_reason\" varchar(40)\n \"selection_level\" int4\n \"selection_sublevel\" int4\n \"selection_overall\" int4\n}\n\nTable \"persons\" {\n \"id\" int4 [not null, increment]\n \"person_key\" varchar(100) [not null]\n \"publisher_id\" int4 [not null]\n \"gender\" varchar(20)\n \"birth_date\" varchar(30)\n \"death_date\" varchar(30)\n \"birth_location_id\" int4\n \"hometown_location_id\" int4\n \"residence_location_id\" int4\n \"death_location_id\" int4\n}\n\nTable \"persons_documents\" {\n \"person_id\" int4 [not null]\n \"document_id\" int4 [not null]\n}\n\nTable \"persons_media\" {\n \"person_id\" int4 [not null]\n \"media_id\" int4 [not null]\n}\n\nTable \"positions\" {\n \"id\" int4 [not null, increment]\n \"affiliation_id\" int4 [not null]\n \"abbreviation\" varchar(100) [not null]\n}\n\nTable \"publishers\" {\n \"id\" int4 [not null, increment]\n \"publisher_key\" varchar(100) [not null]\n \"publisher_name\" varchar(100)\n}\n\nTable \"roles\" {\n \"id\" int4 [not null, increment]\n \"role_key\" varchar(100) [not null]\n \"role_name\" varchar(100)\n \"comment\" varchar(100)\n}\n\nTable \"seasons\" {\n \"id\" int4 [not null, increment]\n \"season_key\" int4 [not null]\n \"publisher_id\" int4 [not null]\n \"league_id\" int4 [not null]\n \"start_date_time\" timestamp\n \"end_date_time\" timestamp\n}\n\nTable \"sites\" {\n \"id\" int4 [not null, increment]\n \"site_key\" int4 [not null]\n \"publisher_id\" int4 [not null]\n \"location_id\" int4\n}\n\nTable \"soccer_defensive_stats\" {\n \"id\" int4 [not null, increment]\n \"shots_penalty_shot_allowed\" varchar(100)\n \"goals_penalty_shot_allowed\" varchar(100)\n \"goals_against_average\" varchar(100)\n \"goals_against_total\" varchar(100)\n \"saves\" varchar(100)\n \"save_percentage\" varchar(100)\n \"catches_punches\" varchar(100)\n \"shots_on_goal_total\" varchar(100)\n \"shots_shootout_total\" varchar(100)\n \"shots_shootout_allowed\" varchar(100)\n 
\"shots_blocked\" varchar(100)\n \"shutouts\" varchar(100)\n}\n\nTable \"soccer_event_states\" {\n \"id\" int4 [not null, increment]\n \"event_id\" int4 [not null]\n \"current_state\" int4\n \"sequence_number\" int4\n \"period_value\" varchar(100)\n \"period_time_elapsed\" varchar(100)\n \"period_time_remaining\" varchar(100)\n \"minutes_elapsed\" varchar(100)\n \"period_minute_elapsed\" varchar(100)\n \"context\" varchar(40)\n}\n\nTable \"soccer_foul_stats\" {\n \"id\" int4 [not null, increment]\n \"fouls_suffered\" varchar(100)\n \"fouls_commited\" varchar(100)\n \"cautions_total\" varchar(100)\n \"cautions_pending\" varchar(100)\n \"caution_points_total\" varchar(100)\n \"caution_points_pending\" varchar(100)\n \"ejections_total\" varchar(100)\n}\n\nTable \"soccer_offensive_stats\" {\n \"id\" int4 [not null, increment]\n \"goals_game_winning\" varchar(100)\n \"goals_game_tying\" varchar(100)\n \"goals_overtime\" varchar(100)\n \"goals_shootout\" varchar(100)\n \"goals_total\" varchar(100)\n \"assists_game_winning\" varchar(100)\n \"assists_game_tying\" varchar(100)\n \"assists_overtime\" varchar(100)\n \"assists_total\" varchar(100)\n \"points\" varchar(100)\n \"shots_total\" varchar(100)\n \"shots_on_goal_total\" varchar(100)\n \"shots_hit_frame\" varchar(100)\n \"shots_penalty_shot_taken\" varchar(100)\n \"shots_penalty_shot_scored\" varchar(100)\n \"shots_penalty_shot_missed\" varchar(40)\n \"shots_penalty_shot_percentage\" varchar(40)\n \"shots_shootout_taken\" varchar(40)\n \"shots_shootout_scored\" varchar(40)\n \"shots_shootout_missed\" varchar(40)\n \"shots_shootout_percentage\" varchar(40)\n \"giveaways\" varchar(40)\n \"offsides\" varchar(40)\n \"corner_kicks\" varchar(40)\n \"hat_tricks\" varchar(40)\n}\n\nTable \"standing_subgroups\" {\n \"id\" int4 [not null, increment]\n \"standing_id\" int4 [not null]\n \"affiliation_id\" int4 [not null]\n}\n\nTable \"standings\" {\n \"id\" int4 [not null, increment]\n \"affiliation_id\" int4 [not null]\n \"standing_type\" varchar(100)\n \"sub_season_id\" int4 [not null]\n \"last_updated\" varchar(100)\n \"duration_scope\" varchar(100)\n \"competition_scope\" varchar(100)\n \"competition_scope_id\" varchar(100)\n \"alignment_scope\" varchar(100)\n \"site_scope\" varchar(100)\n \"scoping_label\" varchar(100)\n \"publisher_id\" int4 [not null]\n \"source\" varchar(100)\n}\n\nTable \"stats\" {\n \"id\" int4 [not null, increment]\n \"stat_repository_type\" varchar(100)\n \"stat_repository_id\" int4 [not null]\n \"stat_holder_type\" varchar(100)\n \"stat_holder_id\" int4\n \"stat_coverage_type\" varchar(100)\n \"stat_coverage_id\" int4\n \"context\" varchar(40) [not null]\n}\n\nTable \"sub_periods\" {\n \"id\" int4 [not null, increment]\n \"period_id\" int4 [not null]\n \"sub_period_value\" varchar(100)\n \"score\" varchar(100)\n}\n\nTable \"sub_seasons\" {\n \"id\" int4 [not null, increment]\n \"sub_season_key\" varchar(100) [not null]\n \"season_id\" int4 [not null]\n \"sub_season_type\" varchar(100) [not null]\n \"start_date_time\" timestamp\n \"end_date_time\" timestamp\n}\n\nTable \"team_american_football_stats\" {\n \"id\" int4 [not null, increment]\n \"yards_per_attempt\" varchar(100)\n \"average_starting_position\" varchar(100)\n \"timeouts\" varchar(100)\n \"time_of_possession\" varchar(100)\n \"turnover_ratio\" varchar(100)\n}\n\nTable \"team_phases\" {\n \"id\" int4 [not null, increment]\n \"team_id\" int4 [not null]\n \"start_season_id\" int4\n \"end_season_id\" int4\n \"affiliation_id\" int4 [not null]\n \"start_date_time\" 
varchar(100)\n \"end_date_time\" varchar(100)\n \"phase_status\" varchar(40)\n \"role_id\" int4\n}\n\nTable \"teams\" {\n \"id\" int4 [not null, increment]\n \"team_key\" varchar(100) [not null]\n \"publisher_id\" int4 [not null]\n \"home_site_id\" int4\n}\n\nTable \"teams_documents\" {\n \"team_id\" int4 [not null]\n \"document_id\" int4 [not null]\n}\n\nTable \"teams_media\" {\n \"team_id\" int4 [not null]\n \"media_id\" int4 [not null]\n}\n\nTable \"tennis_action_points\" {\n \"id\" int4 [not null, increment]\n \"sub_period_id\" varchar(100)\n \"sequence_number\" varchar(100)\n \"win_type\" varchar(100)\n}\n\nTable \"tennis_action_volleys\" {\n \"id\" int4 [not null, increment]\n \"sequence_number\" varchar(100)\n \"tennis_action_points_id\" int4\n \"landing_location\" varchar(100)\n \"swing_type\" varchar(100)\n \"result\" varchar(100)\n \"spin_type\" varchar(100)\n \"trajectory_details\" varchar(100)\n}\n\nTable \"tennis_event_states\" {\n \"id\" int4 [not null, increment]\n \"event_id\" int4 [not null]\n \"current_state\" int4\n \"sequence_number\" int4\n \"tennis_set\" varchar(100)\n \"game\" varchar(100)\n \"server_person_id\" int4\n \"server_score\" varchar(100)\n \"receiver_person_id\" int4\n \"receiver_score\" varchar(100)\n \"service_number\" varchar(100)\n \"context\" varchar(40)\n}\n\nTable \"tennis_return_stats\" {\n \"id\" int4 [not null, increment]\n \"returns_played\" varchar(100)\n \"matches_played\" varchar(100)\n \"first_service_return_points_won\" varchar(100)\n \"first_service_return_points_won_pct\" varchar(100)\n \"second_service_return_points_won\" varchar(100)\n \"second_service_return_points_won_pct\" varchar(100)\n \"return_games_played\" varchar(100)\n \"return_games_won\" varchar(100)\n \"return_games_won_pct\" varchar(100)\n \"break_points_played\" varchar(100)\n \"break_points_converted\" varchar(100)\n \"break_points_converted_pct\" varchar(100)\n}\n\nTable \"tennis_service_stats\" {\n \"id\" int4 [not null, increment]\n \"services_played\" varchar(100)\n \"matches_played\" varchar(100)\n \"aces\" varchar(100)\n \"first_services_good\" varchar(100)\n \"first_services_good_pct\" varchar(100)\n \"first_service_points_won\" varchar(100)\n \"first_service_points_won_pct\" varchar(100)\n \"second_service_points_won\" varchar(100)\n \"second_service_points_won_pct\" varchar(100)\n \"service_games_played\" varchar(100)\n \"service_games_won\" varchar(100)\n \"service_games_won_pct\" varchar(100)\n \"break_points_played\" varchar(100)\n \"break_points_saved\" varchar(100)\n \"break_points_saved_pct\" varchar(100)\n}\n\nTable \"wagering_moneylines\" {\n \"id\" int4 [not null, increment]\n \"bookmaker_id\" int4 [not null]\n \"event_id\" int4 [not null]\n \"date_time\" timestamp\n \"team_id\" int4 [not null]\n \"person_id\" int4\n \"rotation_key\" varchar(100)\n \"comment\" varchar(100)\n \"vigorish\" varchar(100)\n \"line\" varchar(100)\n \"line_opening\" varchar(100)\n \"prediction\" varchar(100)\n}\n\nTable \"wagering_odds_lines\" {\n \"id\" int4 [not null, increment]\n \"bookmaker_id\" int4 [not null]\n \"event_id\" int4 [not null]\n \"date_time\" timestamp\n \"team_id\" int4 [not null]\n \"person_id\" int4\n \"rotation_key\" varchar(100)\n \"comment\" varchar(100)\n \"numerator\" varchar(100)\n \"denominator\" varchar(100)\n \"prediction\" varchar(100)\n \"payout_calculation\" varchar(100)\n \"payout_amount\" varchar(100)\n}\n\nTable \"wagering_runlines\" {\n \"id\" int4 [not null, increment]\n \"bookmaker_id\" int4 [not null]\n \"event_id\" int4 [not null]\n 
\"date_time\" timestamp\n \"team_id\" int4 [not null]\n \"person_id\" int4\n \"rotation_key\" varchar(100)\n \"comment\" varchar(100)\n \"vigorish\" varchar(100)\n \"line\" varchar(100)\n \"line_opening\" varchar(100)\n \"line_value\" varchar(100)\n \"prediction\" varchar(100)\n}\n\nTable \"wagering_straight_spread_lines\" {\n \"id\" int4 [not null, increment]\n \"bookmaker_id\" int4 [not null]\n \"event_id\" int4 [not null]\n \"date_time\" timestamp\n \"team_id\" int4 [not null]\n \"person_id\" int4\n \"rotation_key\" varchar(100)\n \"comment\" varchar(100)\n \"vigorish\" varchar(100)\n \"line_value\" varchar(100)\n \"line_value_opening\" varchar(100)\n \"prediction\" varchar(100)\n}\n\nTable \"wagering_total_score_lines\" {\n \"id\" int4 [not null, increment]\n \"bookmaker_id\" int4 [not null]\n \"event_id\" int4 [not null]\n \"date_time\" timestamp\n \"team_id\" int4 [not null]\n \"person_id\" int4\n \"rotation_key\" varchar(100)\n \"comment\" varchar(100)\n \"vigorish\" varchar(100)\n \"line_over\" varchar(100)\n \"line_under\" varchar(100)\n \"total\" varchar(100)\n \"total_opening\" varchar(100)\n \"prediction\" varchar(100)\n}\n\nTable \"weather_conditions\" {\n \"id\" int4 [not null, increment]\n \"event_id\" int4 [not null]\n \"temperature\" varchar(100)\n \"temperature_units\" varchar(40)\n \"humidity\" varchar(100)\n \"clouds\" varchar(100)\n \"wind_direction\" varchar(100)\n \"wind_velocity\" varchar(100)\n \"weather_code\" varchar(100)\n}\n",
"stateUrl": null,
"stateOriginalUrl": "https://github.com/yugabyte/yugabyte-db/blob/master/sample/sportsdb_tables.sql"
}
}
================================================
FILE: tasks/postgres/standard/sports/baseball_player_analysis/verify.py
================================================
"""
Verification script for PostgreSQL Sports Task 1: Baseball Player Analysis
"""
import os
import sys
import psycopg2
from decimal import Decimal
def rows_match(actual_row, expected_row):
"""Compare two rows with appropriate tolerance for decimals and floats."""
if len(actual_row) != len(expected_row):
return False
for actual, expected in zip(actual_row, expected_row):
if isinstance(actual, (Decimal, float)) and isinstance(expected, (Decimal, float)):
        # Allow a small absolute tolerance when comparing numeric values
if abs(float(actual) - float(expected)) > 0.001:
return False
elif actual != expected:
return False
return True
def get_connection_params() -> dict:
"""Get database connection parameters."""
return {
"host": os.getenv("POSTGRES_HOST", "localhost"),
"port": int(os.getenv("POSTGRES_PORT", 5432)),
"database": os.getenv("POSTGRES_DATABASE"),
"user": os.getenv("POSTGRES_USERNAME"),
"password": os.getenv("POSTGRES_PASSWORD")
}
def verify_baseball_player_analysis_table(conn) -> bool:
"""Verify the baseball_player_analysis table results."""
with conn.cursor() as cur:
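        # Read back the table created by the task, ordered the same way as the ground-truth query below.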
cur.execute("""
SELECT player_id, player_name, team_name, games_played, at_bats, hits,
runs_scored, rbi, home_runs, batting_average, defensive_games,
putouts, assists, errors, fielding_percentage
FROM baseball_player_analysis
ORDER BY batting_average DESC, games_played DESC
""")
actual_results = cur.fetchall()
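        # Recompute the expected rows directly from the source tables (ground truth).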
cur.execute("""
SELECT
p.id AS player_id,
MAX(dn.full_name) AS player_name,
'Unknown' AS team_name,
core.events_played AS games_played,
off.at_bats,
off.hits,
off.runs_scored,
off.rbi,
off.home_runs,
CASE WHEN off.at_bats > 0
THEN 1.0 * off.hits / off.at_bats
ELSE 0
END AS batting_average,
core.events_played AS defensive_games,
COALESCE(def.putouts, 0) AS putouts,
COALESCE(def.assists, 0) AS assists,
COALESCE(def.errors, 0) AS errors,
CASE
WHEN (COALESCE(def.putouts,0) + COALESCE(def.assists,0) + COALESCE(def.errors,0)) > 0
THEN 1.0 * (COALESCE(def.putouts,0) + COALESCE(def.assists,0))
/ (COALESCE(def.putouts,0) + COALESCE(def.assists,0) + COALESCE(def.errors,0))
ELSE 0
END AS fielding_percentage
FROM persons p
JOIN display_names dn
ON dn.entity_id = p.id
AND dn.entity_type = 'persons'
AND NULLIF(TRIM(dn.full_name), '') IS NOT NULL
JOIN (
SELECT s.stat_holder_id AS player_id,
SUM(bos.at_bats) AS at_bats,
SUM(bos.hits) AS hits,
SUM(bos.runs_scored) AS runs_scored,
SUM(bos.rbi) AS rbi,
SUM(bos.home_runs) AS home_runs
FROM stats s
JOIN baseball_offensive_stats bos
ON bos.id = s.stat_repository_id
WHERE s.stat_holder_type = 'persons'
AND s.stat_repository_type = 'baseball_offensive_stats'
AND s.context = 'season-regular'
GROUP BY s.stat_holder_id
) off ON off.player_id = p.id
JOIN (
SELECT s.stat_holder_id AS player_id,
SUM(cps.events_played) AS events_played
FROM stats s
JOIN core_person_stats cps
ON cps.id = s.stat_repository_id
WHERE s.stat_holder_type = 'persons'
AND s.stat_repository_type = 'core_person_stats'
AND s.context = 'season-regular'
GROUP BY s.stat_holder_id
) core ON core.player_id = p.id
LEFT JOIN (
SELECT s.stat_holder_id AS player_id,
SUM(bds.putouts) AS putouts,
SUM(bds.assists) AS assists,
SUM(bds.errors) AS errors
FROM stats s
JOIN baseball_defensive_stats bds
ON bds.id = s.stat_repository_id
WHERE s.stat_holder_type = 'persons'
AND s.stat_repository_type = 'baseball_defensive_stats'
AND s.context = 'season-regular'
GROUP BY s.stat_holder_id
) def ON def.player_id = p.id
WHERE core.events_played >= 10
AND off.at_bats >= 50
GROUP BY
p.id, core.events_played,
off.at_bats, off.hits, off.runs_scored, off.rbi, off.home_runs,
def.putouts, def.assists, def.errors
ORDER BY batting_average DESC, games_played DESC;
""")
expected_results = cur.fetchall()
if len(actual_results) != len(expected_results):
print(f"❌ baseball_player_analysis table has {len(actual_results)} records, expected {len(expected_results)}")
return False
mismatches = 0
for i, (actual, expected) in enumerate(zip(actual_results, expected_results)):
if not rows_match(actual, expected):
if mismatches < 5: # Only show first 5 mismatches
print(f"❌ Player analysis row {i+1} mismatch: expected {expected}, got {actual}")
mismatches += 1
if mismatches > 0:
print(f"❌ Total player analysis mismatches: {mismatches}")
return False
print(f"✅ baseball_player_analysis table created and populated correctly ({len(actual_results)} players)")
return True
def main():
"""Main verification function."""
print("=" * 70)
print("PostgreSQL Sports Task 1 Verification: Baseball Player Analysis")
print("=" * 70)
# Get connection parameters
conn_params = get_connection_params()
if not conn_params["database"]:
print("❌ No database specified")
sys.exit(1)
try:
# Connect to database
conn = psycopg2.connect(**conn_params)
# Verify results
success = verify_baseball_player_analysis_table(conn)
conn.close()
if success:
print("\n🎉 Task verification: PASS")
sys.exit(0)
else:
print("\n❌ Task verification: FAIL")
sys.exit(1)
except psycopg2.Error as e:
print(f"❌ Database error: {e}")
sys.exit(1)
except Exception as e:
print(f"❌ Verification error: {e}")
sys.exit(1)
if __name__ == "__main__":
main()
================================================
FILE: tasks/postgres/standard/sports/participant_report_optimization/description.md
================================================
# Query Performance Optimization
## Background
You need to optimize a slow-running analytics query that generates participant performance reports; in its current form it takes too long to execute.
## Requirements
### 1. Create Performance Report Table
Create a table called `participant_performance_report` with the following structure:
- report_id (serial primary key)
- participant_id (integer not null)
- event_count (integer)
- stat_count (integer)
- stat_type_count (integer)
- last_event_date (timestamp)
- created_at (timestamp default current_timestamp)
Add constraint: CHECK (participant_id > 0); a DDL sketch for this table is shown below.
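A minimal sketch matching the structure above (column names and types taken directly from the list; the exact statement you write may differ):
```sql
-- Sketch only: one possible way to express the required table.
CREATE TABLE participant_performance_report (
    report_id       SERIAL PRIMARY KEY,
    participant_id  INTEGER NOT NULL CHECK (participant_id > 0),
    event_count     INTEGER,
    stat_count      INTEGER,
    stat_type_count INTEGER,
    last_event_date TIMESTAMP,
    created_at      TIMESTAMP DEFAULT CURRENT_TIMESTAMP
);
```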
### 2. Execute and Optimize the Slow Query
The following query is currently running very slowly. Your task is to:
1. **Identify why the query is slow**
2. **Create appropriate indexes to optimize it** (an example index sketch follows the query below)
3. **Populate the report table with the query results**
```sql
SELECT
pe.participant_id,
COUNT(pe.event_id) as event_count,
(SELECT COUNT(*) FROM stats s WHERE s.stat_holder_id = pe.participant_id AND s.stat_holder_type = 'persons') as stat_count,
(SELECT COUNT(DISTINCT s.stat_repository_type) FROM stats s WHERE s.stat_holder_id = pe.participant_id AND s.stat_holder_type = 'persons') as stat_type_count,
(SELECT MAX(e.start_date_time) FROM events e JOIN participants_events pe2 ON e.id = pe2.event_id WHERE pe2.participant_id = pe.participant_id) as last_event_date
FROM participants_events pe
WHERE pe.participant_id <= 50
GROUP BY pe.participant_id
ORDER BY pe.participant_id;
```
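The correlated subqueries against `stats` and the join through `participants_events` are the usual bottlenecks here. A hedged sketch of supporting indexes, assuming those access paths dominate the plan (index names are illustrative, not required by the task):
```sql
-- Sketch only: candidate indexes; confirm their effect with EXPLAIN ANALYZE on your data.
CREATE INDEX IF NOT EXISTS idx_stats_holder
    ON stats (stat_holder_id, stat_holder_type, stat_repository_type);
CREATE INDEX IF NOT EXISTS idx_participants_events_participant
    ON participants_events (participant_id, event_id);
```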
### 3. Document Performance Improvement
After optimization, insert the results into your `participant_performance_report` table.
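One way to do this is to feed the (now optimized) query above straight into an `INSERT ... SELECT`; a minimal sketch:
```sql
-- Sketch only: reuse the report query to populate the table.
INSERT INTO participant_performance_report
    (participant_id, event_count, stat_count, stat_type_count, last_event_date)
SELECT
    pe.participant_id,
    COUNT(pe.event_id),
    (SELECT COUNT(*) FROM stats s
       WHERE s.stat_holder_id = pe.participant_id AND s.stat_holder_type = 'persons'),
    (SELECT COUNT(DISTINCT s.stat_repository_type) FROM stats s
       WHERE s.stat_holder_id = pe.participant_id AND s.stat_holder_type = 'persons'),
    (SELECT MAX(e.start_date_time) FROM events e
       JOIN participants_events pe2 ON e.id = pe2.event_id
       WHERE pe2.participant_id = pe.participant_id)
FROM participants_events pe
WHERE pe.participant_id <= 50
GROUP BY pe.participant_id
ORDER BY pe.participant_id;
```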
## Success Criteria
- The query should execute significantly faster after your optimization
- All results should be correctly inserted into the report table
- Your optimization should use appropriate database indexes
## Important Notes
- Analyze the query execution plan (e.g., with `EXPLAIN ANALYZE`) to identify bottlenecks
- Focus on the most impactful optimizations
- Handle NULL values appropriately in calculations
================================================
FILE: tasks/postgres/standard/sports/participant_report_optimization/meta.json
================================================
{
"task_id": "participant_report_optimization",
"task_name": "Participant Report Optimization",
"category_id": "sports",
"category_name": "Sports",
"description": "Optimize slow-running participant performance query by creating indexes and populating performance report table.",
"author": "Lingxiao Du",
"created_at": "2025-08-18",
"difficulty": "L3",
"tags": [
"performance optimization",
"schema design"
],
"mcp": [
"postgres"
],
"meta_data": {
"stateType": "text",
"stateContent": "Table \"addresses\" {\n \"id\" int4 [not null, increment]\n \"location_id\" int4 [not null]\n \"language\" varchar(100)\n \"suite\" varchar(100)\n \"floor\" varchar(100)\n \"building\" varchar(100)\n \"street_number\" varchar(100)\n \"street_prefix\" varchar(100)\n \"street\" varchar(100)\n \"street_suffix\" varchar(100)\n \"neighborhood\" varchar(100)\n \"district\" varchar(100)\n \"locality\" varchar(100)\n \"county\" varchar(100)\n \"region\" varchar(100)\n \"postal_code\" varchar(100)\n \"country\" varchar(100)\n}\n\nTable \"affiliation_phases\" {\n \"id\" int4 [not null, increment]\n \"affiliation_id\" int4 [not null]\n \"ancestor_affiliation_id\" int4\n \"start_season_id\" int4\n \"start_date_time\" timestamp\n \"end_season_id\" int4\n \"end_date_time\" timestamp\n}\n\nTable \"affiliations\" {\n \"id\" int4 [not null, increment]\n \"affiliation_key\" varchar(100) [not null]\n \"affiliation_type\" varchar(100)\n \"publisher_id\" int4 [not null]\n}\n\nTable \"affiliations_documents\" {\n \"affiliation_id\" int4 [not null]\n \"document_id\" int4 [not null]\n}\n\nTable \"affiliations_events\" {\n \"affiliation_id\" int4 [not null]\n \"event_id\" int4 [not null]\n}\n\nTable \"affiliations_media\" {\n \"affiliation_id\" int4 [not null]\n \"media_id\" int4 [not null]\n}\n\nTable \"american_football_action_participants\" {\n \"id\" int4 [not null, increment]\n \"american_football_action_play_id\" int4 [not null]\n \"person_id\" int4 [not null]\n \"participant_role\" varchar(100) [not null]\n \"score_type\" varchar(100)\n \"field_line\" int4\n \"yardage\" int4\n \"score_credit\" int4\n \"yards_gained\" int4\n}\n\nTable \"american_football_action_plays\" {\n \"id\" int4 [not null, increment]\n \"american_football_event_state_id\" int4 [not null]\n \"play_type\" varchar(100)\n \"score_attempt_type\" varchar(100)\n \"drive_result\" varchar(100)\n \"points\" int4\n \"comment\" varchar(255)\n}\n\nTable \"american_football_defensive_stats\" {\n \"id\" int4 [not null, increment]\n \"tackles_total\" varchar(100)\n \"tackles_solo\" varchar(100)\n \"tackles_assists\" varchar(100)\n \"interceptions_total\" varchar(100)\n \"interceptions_yards\" varchar(100)\n \"interceptions_average\" varchar(100)\n \"interceptions_longest\" varchar(100)\n \"interceptions_touchdown\" varchar(100)\n \"quarterback_hurries\" varchar(100)\n \"sacks_total\" varchar(100)\n \"sacks_yards\" varchar(100)\n \"passes_defensed\" varchar(100)\n}\n\nTable \"american_football_down_progress_stats\" {\n \"id\" int4 [not null, increment]\n \"first_downs_total\" varchar(100)\n \"first_downs_pass\" varchar(100)\n \"first_downs_run\" varchar(100)\n \"first_downs_penalty\" varchar(100)\n \"conversions_third_down\" varchar(100)\n \"conversions_third_down_attempts\" varchar(100)\n \"conversions_third_down_percentage\" varchar(100)\n \"conversions_fourth_down\" varchar(100)\n \"conversions_fourth_down_attempts\" varchar(100)\n \"conversions_fourth_down_percentage\" varchar(100)\n}\n\nTable \"american_football_event_states\" {\n \"id\" int4 [not null, increment]\n \"event_id\" int4 [not null]\n \"current_state\" int2\n \"sequence_number\" int4\n \"period_value\" int4\n \"period_time_elapsed\" varchar(100)\n \"period_time_remaining\" varchar(100)\n \"clock_state\" varchar(100)\n \"down\" int4\n \"team_in_possession_id\" int4\n \"distance_for_1st_down\" int4\n \"field_side\" varchar(100)\n \"field_line\" int4\n \"context\" varchar(40)\n}\n\nTable \"american_football_fumbles_stats\" {\n \"id\" int4 [not null, increment]\n 
\"fumbles_committed\" varchar(100)\n \"fumbles_forced\" varchar(100)\n \"fumbles_recovered\" varchar(100)\n \"fumbles_lost\" varchar(100)\n \"fumbles_yards_gained\" varchar(100)\n \"fumbles_own_committed\" varchar(100)\n \"fumbles_own_recovered\" varchar(100)\n \"fumbles_own_lost\" varchar(100)\n \"fumbles_own_yards_gained\" varchar(100)\n \"fumbles_opposing_committed\" varchar(100)\n \"fumbles_opposing_recovered\" varchar(100)\n \"fumbles_opposing_lost\" varchar(100)\n \"fumbles_opposing_yards_gained\" varchar(100)\n}\n\nTable \"american_football_offensive_stats\" {\n \"id\" int4 [not null, increment]\n \"offensive_plays_yards\" varchar(100)\n \"offensive_plays_number\" varchar(100)\n \"offensive_plays_average_yards_per\" varchar(100)\n \"possession_duration\" varchar(100)\n \"turnovers_giveaway\" varchar(100)\n}\n\nTable \"american_football_passing_stats\" {\n \"id\" int4 [not null, increment]\n \"passes_attempts\" varchar(100)\n \"passes_completions\" varchar(100)\n \"passes_percentage\" varchar(100)\n \"passes_yards_gross\" varchar(100)\n \"passes_yards_net\" varchar(100)\n \"passes_yards_lost\" varchar(100)\n \"passes_touchdowns\" varchar(100)\n \"passes_touchdowns_percentage\" varchar(100)\n \"passes_interceptions\" varchar(100)\n \"passes_interceptions_percentage\" varchar(100)\n \"passes_longest\" varchar(100)\n \"passes_average_yards_per\" varchar(100)\n \"passer_rating\" varchar(100)\n \"receptions_total\" varchar(100)\n \"receptions_yards\" varchar(100)\n \"receptions_touchdowns\" varchar(100)\n \"receptions_first_down\" varchar(100)\n \"receptions_longest\" varchar(100)\n \"receptions_average_yards_per\" varchar(100)\n}\n\nTable \"american_football_penalties_stats\" {\n \"id\" int4 [not null, increment]\n \"penalties_total\" varchar(100)\n \"penalty_yards\" varchar(100)\n \"penalty_first_downs\" varchar(100)\n}\n\nTable \"american_football_rushing_stats\" {\n \"id\" int4 [not null, increment]\n \"rushes_attempts\" varchar(100)\n \"rushes_yards\" varchar(100)\n \"rushes_touchdowns\" varchar(100)\n \"rushing_average_yards_per\" varchar(100)\n \"rushes_first_down\" varchar(100)\n \"rushes_longest\" varchar(100)\n}\n\nTable \"american_football_sacks_against_stats\" {\n \"id\" int4 [not null, increment]\n \"sacks_against_yards\" varchar(100)\n \"sacks_against_total\" varchar(100)\n}\n\nTable \"american_football_scoring_stats\" {\n \"id\" int4 [not null, increment]\n \"touchdowns_total\" varchar(100)\n \"touchdowns_passing\" varchar(100)\n \"touchdowns_rushing\" varchar(100)\n \"touchdowns_special_teams\" varchar(100)\n \"touchdowns_defensive\" varchar(100)\n \"extra_points_attempts\" varchar(100)\n \"extra_points_made\" varchar(100)\n \"extra_points_missed\" varchar(100)\n \"extra_points_blocked\" varchar(100)\n \"field_goal_attempts\" varchar(100)\n \"field_goals_made\" varchar(100)\n \"field_goals_missed\" varchar(100)\n \"field_goals_blocked\" varchar(100)\n \"safeties_against\" varchar(100)\n \"two_point_conversions_attempts\" varchar(100)\n \"two_point_conversions_made\" varchar(100)\n \"touchbacks_total\" varchar(100)\n}\n\nTable \"american_football_special_teams_stats\" {\n \"id\" int4 [not null, increment]\n \"returns_punt_total\" varchar(100)\n \"returns_punt_yards\" varchar(100)\n \"returns_punt_average\" varchar(100)\n \"returns_punt_longest\" varchar(100)\n \"returns_punt_touchdown\" varchar(100)\n \"returns_kickoff_total\" varchar(100)\n \"returns_kickoff_yards\" varchar(100)\n \"returns_kickoff_average\" varchar(100)\n \"returns_kickoff_longest\" varchar(100)\n 
\"returns_kickoff_touchdown\" varchar(100)\n \"returns_total\" varchar(100)\n \"returns_yards\" varchar(100)\n \"punts_total\" varchar(100)\n \"punts_yards_gross\" varchar(100)\n \"punts_yards_net\" varchar(100)\n \"punts_longest\" varchar(100)\n \"punts_inside_20\" varchar(100)\n \"punts_inside_20_percentage\" varchar(100)\n \"punts_average\" varchar(100)\n \"punts_blocked\" varchar(100)\n \"touchbacks_total\" varchar(100)\n \"touchbacks_total_percentage\" varchar(100)\n \"touchbacks_kickoffs\" varchar(100)\n \"touchbacks_kickoffs_percentage\" varchar(100)\n \"touchbacks_punts\" varchar(100)\n \"touchbacks_punts_percentage\" varchar(100)\n \"touchbacks_interceptions\" varchar(100)\n \"touchbacks_interceptions_percentage\" varchar(100)\n \"fair_catches\" varchar(100)\n}\n\nTable \"baseball_action_contact_details\" {\n \"id\" int4 [not null, increment]\n \"baseball_action_pitch_id\" int4 [not null]\n \"location\" varchar(100)\n \"strength\" varchar(100)\n \"velocity\" int4\n \"comment\" text\n \"trajectory_coordinates\" varchar(100)\n \"trajectory_formula\" varchar(100)\n}\n\nTable \"baseball_action_pitches\" {\n \"id\" int4 [not null, increment]\n \"baseball_action_play_id\" int4 [not null]\n \"sequence_number\" int4\n \"baseball_defensive_group_id\" int4\n \"umpire_call\" varchar(100)\n \"pitch_location\" varchar(100)\n \"pitch_type\" varchar(100)\n \"pitch_velocity\" int4\n \"comment\" text\n \"trajectory_coordinates\" varchar(100)\n \"trajectory_formula\" varchar(100)\n \"ball_type\" varchar(40)\n \"strike_type\" varchar(40)\n}\n\nTable \"baseball_action_plays\" {\n \"id\" int4 [not null, increment]\n \"baseball_event_state_id\" int4 [not null]\n \"play_type\" varchar(100)\n \"notation\" varchar(100)\n \"notation_yaml\" text\n \"baseball_defensive_group_id\" int4\n \"comment\" varchar(255)\n \"runner_on_first_advance\" int4\n \"runner_on_second_advance\" int4\n \"runner_on_third_advance\" int4\n \"outs_recorded\" int4\n \"rbi\" int4\n \"runs_scored\" int4\n \"earned_runs_scored\" varchar(100)\n}\n\nTable \"baseball_action_substitutions\" {\n \"id\" int4 [not null, increment]\n \"baseball_event_state_id\" int4 [not null]\n \"sequence_number\" int4\n \"person_type\" varchar(100)\n \"person_original_id\" int4\n \"person_original_position_id\" int4\n \"person_original_lineup_slot\" int4\n \"person_replacing_id\" int4\n \"person_replacing_position_id\" int4\n \"person_replacing_lineup_slot\" int4\n \"substitution_reason\" varchar(100)\n \"comment\" varchar(100)\n}\n\nTable \"baseball_defensive_group\" {\n \"id\" int4 [not null, increment]\n}\n\nTable \"baseball_defensive_players\" {\n \"id\" int4 [not null, increment]\n \"baseball_defensive_group_id\" int4 [not null]\n \"player_id\" int4 [not null]\n \"position_id\" int4 [not null]\n}\n\nTable \"baseball_defensive_stats\" {\n \"id\" int4 [not null, increment]\n \"double_plays\" int4\n \"triple_plays\" int4\n \"putouts\" int4\n \"assists\" int4\n \"errors\" int4\n \"fielding_percentage\" numeric\n \"defensive_average\" numeric\n \"errors_passed_ball\" int4\n \"errors_catchers_interference\" int4\n}\n\nTable \"baseball_event_states\" {\n \"id\" int4 [not null, increment]\n \"event_id\" int4 [not null]\n \"current_state\" int2\n \"sequence_number\" int4\n \"at_bat_number\" int4\n \"inning_value\" int4\n \"inning_half\" varchar(100)\n \"outs\" int4\n \"balls\" int4\n \"strikes\" int4\n \"runner_on_first_id\" int4\n \"runner_on_second_id\" int4\n \"runner_on_third_id\" int4\n \"runner_on_first\" int2\n \"runner_on_second\" int2\n 
\"runner_on_third\" int2\n \"runs_this_inning_half\" int4\n \"pitcher_id\" int4\n \"batter_id\" int4\n \"batter_side\" varchar(100)\n \"context\" varchar(40)\n}\n\nTable \"baseball_offensive_stats\" {\n \"id\" int4 [not null, increment]\n \"average\" numeric\n \"runs_scored\" int4\n \"at_bats\" int4\n \"hits\" int4\n \"rbi\" int4\n \"total_bases\" int4\n \"slugging_percentage\" numeric\n \"bases_on_balls\" int4\n \"strikeouts\" int4\n \"left_on_base\" int4\n \"left_in_scoring_position\" int4\n \"singles\" int4\n \"doubles\" int4\n \"triples\" int4\n \"home_runs\" int4\n \"grand_slams\" int4\n \"at_bats_per_rbi\" numeric\n \"plate_appearances_per_rbi\" numeric\n \"at_bats_per_home_run\" numeric\n \"plate_appearances_per_home_run\" numeric\n \"sac_flies\" int4\n \"sac_bunts\" int4\n \"grounded_into_double_play\" int4\n \"moved_up\" int4\n \"on_base_percentage\" numeric\n \"stolen_bases\" int4\n \"stolen_bases_caught\" int4\n \"stolen_bases_average\" numeric\n \"hit_by_pitch\" int4\n \"defensive_interferance_reaches\" int4\n \"on_base_plus_slugging\" numeric\n \"plate_appearances\" int4\n \"hits_extra_base\" int4\n}\n\nTable \"baseball_pitching_stats\" {\n \"id\" int4 [not null, increment]\n \"runs_allowed\" int4\n \"singles_allowed\" int4\n \"doubles_allowed\" int4\n \"triples_allowed\" int4\n \"home_runs_allowed\" int4\n \"innings_pitched\" varchar(20)\n \"hits\" int4\n \"earned_runs\" int4\n \"unearned_runs\" int4\n \"bases_on_balls\" int4\n \"bases_on_balls_intentional\" int4\n \"strikeouts\" int4\n \"strikeout_to_bb_ratio\" numeric\n \"number_of_pitches\" int4\n \"era\" numeric\n \"inherited_runners_scored\" int4\n \"pick_offs\" int4\n \"errors_hit_with_pitch\" int4\n \"errors_wild_pitch\" int4\n \"balks\" int4\n \"wins\" int4\n \"losses\" int4\n \"saves\" int4\n \"shutouts\" int4\n \"games_complete\" int4\n \"games_finished\" int4\n \"winning_percentage\" numeric\n \"event_credit\" varchar(40)\n \"save_credit\" varchar(40)\n}\n\nTable \"basketball_defensive_stats\" {\n \"id\" int4 [not null, increment]\n \"steals_total\" varchar(100)\n \"steals_per_game\" varchar(100)\n \"blocks_total\" varchar(100)\n \"blocks_per_game\" varchar(100)\n}\n\nTable \"basketball_event_states\" {\n \"id\" int4 [not null, increment]\n \"event_id\" int4 [not null]\n \"current_state\" int4\n \"sequence_number\" int4\n \"period_value\" varchar(100)\n \"period_time_elapsed\" varchar(100)\n \"period_time_remaining\" varchar(100)\n \"context\" varchar(40)\n}\n\nTable \"basketball_offensive_stats\" {\n \"id\" int4 [not null, increment]\n \"field_goals_made\" int4\n \"field_goals_attempted\" int4\n \"field_goals_percentage\" varchar(100)\n \"field_goals_per_game\" varchar(100)\n \"field_goals_attempted_per_game\" varchar(100)\n \"field_goals_percentage_adjusted\" varchar(100)\n \"three_pointers_made\" int4\n \"three_pointers_attempted\" int4\n \"three_pointers_percentage\" varchar(100)\n \"three_pointers_per_game\" varchar(100)\n \"three_pointers_attempted_per_game\" varchar(100)\n \"free_throws_made\" varchar(100)\n \"free_throws_attempted\" varchar(100)\n \"free_throws_percentage\" varchar(100)\n \"free_throws_per_game\" varchar(100)\n \"free_throws_attempted_per_game\" varchar(100)\n \"points_scored_total\" varchar(100)\n \"points_scored_per_game\" varchar(100)\n \"assists_total\" varchar(100)\n \"assists_per_game\" varchar(100)\n \"turnovers_total\" varchar(100)\n \"turnovers_per_game\" varchar(100)\n \"points_scored_off_turnovers\" varchar(100)\n \"points_scored_in_paint\" varchar(100)\n 
\"points_scored_on_second_chance\" varchar(100)\n \"points_scored_on_fast_break\" varchar(100)\n}\n\nTable \"basketball_rebounding_stats\" {\n \"id\" int4 [not null, increment]\n \"rebounds_total\" varchar(100)\n \"rebounds_per_game\" varchar(100)\n \"rebounds_defensive\" varchar(100)\n \"rebounds_offensive\" varchar(100)\n \"team_rebounds_total\" varchar(100)\n \"team_rebounds_per_game\" varchar(100)\n \"team_rebounds_defensive\" varchar(100)\n \"team_rebounds_offensive\" varchar(100)\n}\n\nTable \"basketball_team_stats\" {\n \"id\" int4 [not null, increment]\n \"timeouts_left\" varchar(100)\n \"largest_lead\" varchar(100)\n \"fouls_total\" varchar(100)\n \"turnover_margin\" varchar(100)\n}\n\nTable \"bookmakers\" {\n \"id\" int4 [not null, increment]\n \"bookmaker_key\" varchar(100)\n \"publisher_id\" int4 [not null]\n \"location_id\" int4\n}\n\nTable \"core_person_stats\" {\n \"id\" int4 [not null, increment]\n \"time_played_event\" varchar(40)\n \"time_played_total\" varchar(40)\n \"time_played_event_average\" varchar(40)\n \"events_played\" int4\n \"events_started\" int4\n \"position_id\" int4\n}\n\nTable \"core_stats\" {\n \"id\" int4 [not null, increment]\n \"score\" varchar(100)\n \"score_opposing\" varchar(100)\n \"score_attempts\" varchar(100)\n \"score_attempts_opposing\" varchar(100)\n \"score_percentage\" varchar(100)\n \"score_percentage_opposing\" varchar(100)\n}\n\nTable \"db_info\" {\n \"version\" varchar(100) [not null, default: 16]\n}\n\nTable \"display_names\" {\n \"id\" int4 [not null, increment]\n \"language\" varchar(100) [not null]\n \"entity_type\" varchar(100) [not null]\n \"entity_id\" int4 [not null]\n \"full_name\" varchar(100)\n \"first_name\" varchar(100)\n \"middle_name\" varchar(100)\n \"last_name\" varchar(100)\n \"alias\" varchar(100)\n \"abbreviation\" varchar(100)\n \"short_name\" varchar(100)\n \"prefix\" varchar(20)\n \"suffix\" varchar(20)\n}\n\nTable \"document_classes\" {\n \"id\" int4 [not null, increment]\n \"name\" varchar(100)\n}\n\nTable \"document_contents\" {\n \"id\" int4 [not null, increment]\n \"document_id\" int4 [not null]\n \"sportsml\" varchar(200)\n \"abstract\" text\n}\n\nTable \"document_fixtures\" {\n \"id\" int4 [not null, increment]\n \"fixture_key\" varchar(100)\n \"publisher_id\" int4 [not null]\n \"name\" varchar(100)\n \"document_class_id\" int4 [not null]\n}\n\nTable \"document_fixtures_events\" {\n \"id\" int4 [not null, increment]\n \"document_fixture_id\" int4 [not null]\n \"event_id\" int4 [not null]\n \"latest_document_id\" int4 [not null]\n \"last_update\" timestamp\n}\n\nTable \"document_package_entry\" {\n \"id\" int4 [not null, increment]\n \"document_package_id\" int4 [not null]\n \"rank\" varchar(100)\n \"document_id\" int4 [not null]\n \"headline\" varchar(100)\n \"short_headline\" varchar(100)\n}\n\nTable \"document_packages\" {\n \"id\" int4 [not null, increment]\n \"package_key\" varchar(100)\n \"package_name\" varchar(100)\n \"date_time\" date\n}\n\nTable \"documents\" {\n \"id\" int4 [not null, increment]\n \"doc_id\" varchar(75) [not null]\n \"publisher_id\" int4 [not null]\n \"date_time\" timestamp\n \"title\" varchar(255)\n \"language\" varchar(100)\n \"priority\" varchar(100)\n \"revision_id\" varchar(75)\n \"stats_coverage\" varchar(100)\n \"document_fixture_id\" int4 [not null]\n \"source_id\" int4\n \"db_loading_date_time\" timestamp\n}\n\nTable \"documents_media\" {\n \"id\" int4 [not null, increment]\n \"document_id\" int4 [not null]\n \"media_id\" int4 [not null]\n \"media_caption_id\" int4 [not 
null]\n}\n\nTable \"events\" {\n \"id\" int4 [not null, increment]\n \"event_key\" varchar(100) [not null]\n \"publisher_id\" int4 [not null]\n \"start_date_time\" timestamp\n \"site_id\" int4\n \"site_alignment\" varchar(100)\n \"event_status\" varchar(100)\n \"duration\" varchar(100)\n \"attendance\" varchar(100)\n \"last_update\" timestamp\n}\n\nTable \"events_documents\" {\n \"event_id\" int4 [not null]\n \"document_id\" int4 [not null]\n}\n\nTable \"events_media\" {\n \"event_id\" int4 [not null]\n \"media_id\" int4 [not null]\n}\n\nTable \"events_sub_seasons\" {\n \"event_id\" int4 [not null]\n \"sub_season_id\" int4 [not null]\n}\n\nTable \"ice_hockey_action_participants\" {\n \"id\" int4 [not null, increment]\n \"ice_hockey_action_play_id\" int4 [not null]\n \"person_id\" int4 [not null]\n \"participant_role\" varchar(100) [not null]\n \"point_credit\" int4\n}\n\nTable \"ice_hockey_action_plays\" {\n \"id\" int4 [not null, increment]\n \"ice_hockey_event_state_id\" int4 [not null]\n \"play_type\" varchar(100)\n \"score_attempt_type\" varchar(100)\n \"play_result\" varchar(100)\n \"comment\" varchar(255)\n}\n\nTable \"ice_hockey_defensive_stats\" {\n \"id\" int4 [not null, increment]\n \"shots_power_play_allowed\" varchar(100)\n \"shots_penalty_shot_allowed\" varchar(100)\n \"goals_power_play_allowed\" varchar(100)\n \"goals_penalty_shot_allowed\" varchar(100)\n \"goals_against_average\" varchar(100)\n \"saves\" varchar(100)\n \"save_percentage\" varchar(100)\n \"penalty_killing_amount\" varchar(100)\n \"penalty_killing_percentage\" varchar(100)\n \"shots_blocked\" varchar(100)\n \"takeaways\" varchar(100)\n \"shutouts\" varchar(100)\n \"minutes_penalty_killing\" varchar(100)\n \"hits\" varchar(100)\n \"goals_empty_net_allowed\" varchar(100)\n \"goals_short_handed_allowed\" varchar(100)\n \"goals_shootout_allowed\" varchar(100)\n \"shots_shootout_allowed\" varchar(100)\n}\n\nTable \"ice_hockey_event_states\" {\n \"id\" int4 [not null, increment]\n \"event_id\" int4 [not null]\n \"current_state\" int4\n \"sequence_number\" int4\n \"period_value\" varchar(100)\n \"period_time_elapsed\" varchar(100)\n \"period_time_remaining\" varchar(100)\n \"context\" varchar(40)\n}\n\nTable \"ice_hockey_offensive_stats\" {\n \"id\" int4 [not null, increment]\n \"goals_game_winning\" varchar(100)\n \"goals_game_tying\" varchar(100)\n \"goals_power_play\" varchar(100)\n \"goals_short_handed\" varchar(100)\n \"goals_even_strength\" varchar(100)\n \"goals_empty_net\" varchar(100)\n \"goals_overtime\" varchar(100)\n \"goals_shootout\" varchar(100)\n \"goals_penalty_shot\" varchar(100)\n \"assists\" varchar(100)\n \"points\" varchar(100)\n \"power_play_amount\" varchar(100)\n \"power_play_percentage\" varchar(100)\n \"shots_penalty_shot_taken\" varchar(100)\n \"shots_penalty_shot_missed\" varchar(100)\n \"shots_penalty_shot_percentage\" varchar(100)\n \"giveaways\" varchar(100)\n \"minutes_power_play\" varchar(100)\n \"faceoff_wins\" varchar(100)\n \"faceoff_losses\" varchar(100)\n \"faceoff_win_percentage\" varchar(100)\n \"scoring_chances\" varchar(100)\n}\n\nTable \"ice_hockey_player_stats\" {\n \"id\" int4 [not null, increment]\n \"plus_minus\" varchar(100)\n}\n\nTable \"injury_phases\" {\n \"id\" int4 [not null, increment]\n \"person_id\" int4 [not null]\n \"injury_status\" varchar(100)\n \"injury_type\" varchar(100)\n \"injury_comment\" varchar(100)\n \"disabled_list\" varchar(100)\n \"start_date_time\" timestamp\n \"end_date_time\" timestamp\n \"season_id\" int4\n \"phase_type\" varchar(100)\n 
\"injury_side\" varchar(100)\n}\n\nTable \"key_aliases\" {\n \"id\" int4 [not null, increment]\n \"key_id\" int4 [not null]\n \"key_root_id\" int4 [not null]\n}\n\nTable \"key_roots\" {\n \"id\" int4 [not null, increment]\n \"key_type\" varchar(100)\n}\n\nTable \"latest_revisions\" {\n \"id\" int4 [not null, increment]\n \"revision_id\" varchar(75) [not null]\n \"latest_document_id\" int4 [not null]\n}\n\nTable \"locations\" {\n \"id\" int4 [not null, increment]\n \"timezone\" varchar(100)\n \"latitude\" varchar(100)\n \"longitude\" varchar(100)\n \"country_code\" varchar(100)\n}\n\nTable \"media\" {\n \"id\" int4 [not null, increment]\n \"object_id\" int4\n \"source_id\" int4\n \"revision_id\" int4\n \"media_type\" varchar(100)\n \"publisher_id\" int4 [not null]\n \"date_time\" varchar(100)\n \"credit_id\" int4 [not null]\n \"db_loading_date_time\" timestamp\n \"creation_location_id\" int4 [not null]\n}\n\nTable \"media_captions\" {\n \"id\" int4 [not null, increment]\n \"media_id\" int4 [not null]\n \"caption_type\" varchar(100)\n \"caption\" varchar(100)\n \"caption_author_id\" int4 [not null]\n \"language\" varchar(100)\n \"caption_size\" varchar(100)\n}\n\nTable \"media_contents\" {\n \"id\" int4 [not null, increment]\n \"media_id\" int4 [not null]\n \"object\" varchar(100)\n \"format\" varchar(100)\n \"mime_type\" varchar(100)\n \"height\" varchar(100)\n \"width\" varchar(100)\n \"duration\" varchar(100)\n \"file_size\" varchar(100)\n \"resolution\" varchar(100)\n}\n\nTable \"media_keywords\" {\n \"id\" int4 [not null, increment]\n \"keyword\" varchar(100)\n \"media_id\" int4 [not null]\n}\n\nTable \"motor_racing_event_states\" {\n \"id\" int4 [not null, increment]\n \"event_id\" int4 [not null]\n \"current_state\" int4\n \"sequence_number\" int4\n \"lap\" varchar(100)\n \"laps_remaining\" varchar(100)\n \"time_elapsed\" varchar(100)\n \"flag_state\" varchar(100)\n \"context\" varchar(40)\n}\n\nTable \"motor_racing_qualifying_stats\" {\n \"id\" int4 [not null, increment]\n \"grid\" varchar(100)\n \"pole_position\" varchar(100)\n \"pole_wins\" varchar(100)\n \"qualifying_speed\" varchar(100)\n \"qualifying_speed_units\" varchar(100)\n \"qualifying_time\" varchar(100)\n \"qualifying_position\" varchar(100)\n}\n\nTable \"motor_racing_race_stats\" {\n \"id\" int4 [not null, increment]\n \"time_behind_leader\" varchar(100)\n \"laps_behind_leader\" varchar(100)\n \"time_ahead_follower\" varchar(100)\n \"laps_ahead_follower\" varchar(100)\n \"time\" varchar(100)\n \"points\" varchar(100)\n \"points_rookie\" varchar(100)\n \"bonus\" varchar(100)\n \"laps_completed\" varchar(100)\n \"laps_leading_total\" varchar(100)\n \"distance_leading\" varchar(100)\n \"distance_completed\" varchar(100)\n \"distance_units\" varchar(40)\n \"speed_average\" varchar(40)\n \"speed_units\" varchar(40)\n \"status\" varchar(40)\n \"finishes_top_5\" varchar(40)\n \"finishes_top_10\" varchar(40)\n \"starts\" varchar(40)\n \"finishes\" varchar(40)\n \"non_finishes\" varchar(40)\n \"wins\" varchar(40)\n \"races_leading\" varchar(40)\n \"money\" varchar(40)\n \"money_units\" varchar(40)\n \"leads_total\" varchar(40)\n}\n\nTable \"outcome_totals\" {\n \"id\" int4 [not null, increment]\n \"standing_subgroup_id\" int4 [not null]\n \"outcome_holder_type\" varchar(100)\n \"outcome_holder_id\" int4\n \"rank\" varchar(100)\n \"wins\" varchar(100)\n \"losses\" varchar(100)\n \"ties\" varchar(100)\n \"undecideds\" varchar(100)\n \"winning_percentage\" varchar(100)\n \"points_scored_for\" varchar(100)\n 
\"points_scored_against\" varchar(100)\n \"points_difference\" varchar(100)\n \"standing_points\" varchar(100)\n \"streak_type\" varchar(100)\n \"streak_duration\" varchar(100)\n \"streak_total\" varchar(100)\n \"streak_start\" date\n \"streak_end\" date\n}\n\nTable \"participants_events\" {\n \"id\" int4 [not null, increment]\n \"participant_type\" varchar(100) [not null]\n \"participant_id\" int4 [not null]\n \"event_id\" int4 [not null]\n \"alignment\" varchar(100)\n \"score\" varchar(100)\n \"event_outcome\" varchar(100)\n \"rank\" int4\n}\n\nTable \"periods\" {\n \"id\" int4 [not null, increment]\n \"participant_event_id\" int4 [not null]\n \"period_value\" varchar(100)\n \"score\" varchar(100)\n}\n\nTable \"person_event_metadata\" {\n \"id\" int4 [not null, increment]\n \"person_id\" int4 [not null]\n \"event_id\" int4 [not null]\n \"status\" varchar(100)\n \"health\" varchar(100)\n \"weight\" varchar(100)\n \"role_id\" int4\n \"position_id\" int4\n \"team_id\" int4\n \"lineup_slot\" int4\n \"lineup_slot_sequence\" int4\n}\n\nTable \"person_phases\" {\n \"id\" int4 [not null, increment]\n \"person_id\" int4 [not null]\n \"membership_type\" varchar(40) [not null]\n \"membership_id\" int4 [not null]\n \"role_id\" int4\n \"role_status\" varchar(40)\n \"phase_status\" varchar(40)\n \"uniform_number\" varchar(20)\n \"regular_position_id\" int4\n \"regular_position_depth\" varchar(40)\n \"height\" varchar(100)\n \"weight\" varchar(100)\n \"start_date_time\" timestamp\n \"start_season_id\" int4\n \"end_date_time\" timestamp\n \"end_season_id\" int4\n \"entry_reason\" varchar(40)\n \"exit_reason\" varchar(40)\n \"selection_level\" int4\n \"selection_sublevel\" int4\n \"selection_overall\" int4\n}\n\nTable \"persons\" {\n \"id\" int4 [not null, increment]\n \"person_key\" varchar(100) [not null]\n \"publisher_id\" int4 [not null]\n \"gender\" varchar(20)\n \"birth_date\" varchar(30)\n \"death_date\" varchar(30)\n \"birth_location_id\" int4\n \"hometown_location_id\" int4\n \"residence_location_id\" int4\n \"death_location_id\" int4\n}\n\nTable \"persons_documents\" {\n \"person_id\" int4 [not null]\n \"document_id\" int4 [not null]\n}\n\nTable \"persons_media\" {\n \"person_id\" int4 [not null]\n \"media_id\" int4 [not null]\n}\n\nTable \"positions\" {\n \"id\" int4 [not null, increment]\n \"affiliation_id\" int4 [not null]\n \"abbreviation\" varchar(100) [not null]\n}\n\nTable \"publishers\" {\n \"id\" int4 [not null, increment]\n \"publisher_key\" varchar(100) [not null]\n \"publisher_name\" varchar(100)\n}\n\nTable \"roles\" {\n \"id\" int4 [not null, increment]\n \"role_key\" varchar(100) [not null]\n \"role_name\" varchar(100)\n \"comment\" varchar(100)\n}\n\nTable \"seasons\" {\n \"id\" int4 [not null, increment]\n \"season_key\" int4 [not null]\n \"publisher_id\" int4 [not null]\n \"league_id\" int4 [not null]\n \"start_date_time\" timestamp\n \"end_date_time\" timestamp\n}\n\nTable \"sites\" {\n \"id\" int4 [not null, increment]\n \"site_key\" int4 [not null]\n \"publisher_id\" int4 [not null]\n \"location_id\" int4\n}\n\nTable \"soccer_defensive_stats\" {\n \"id\" int4 [not null, increment]\n \"shots_penalty_shot_allowed\" varchar(100)\n \"goals_penalty_shot_allowed\" varchar(100)\n \"goals_against_average\" varchar(100)\n \"goals_against_total\" varchar(100)\n \"saves\" varchar(100)\n \"save_percentage\" varchar(100)\n \"catches_punches\" varchar(100)\n \"shots_on_goal_total\" varchar(100)\n \"shots_shootout_total\" varchar(100)\n \"shots_shootout_allowed\" varchar(100)\n 
\"shots_blocked\" varchar(100)\n \"shutouts\" varchar(100)\n}\n\nTable \"soccer_event_states\" {\n \"id\" int4 [not null, increment]\n \"event_id\" int4 [not null]\n \"current_state\" int4\n \"sequence_number\" int4\n \"period_value\" varchar(100)\n \"period_time_elapsed\" varchar(100)\n \"period_time_remaining\" varchar(100)\n \"minutes_elapsed\" varchar(100)\n \"period_minute_elapsed\" varchar(100)\n \"context\" varchar(40)\n}\n\nTable \"soccer_foul_stats\" {\n \"id\" int4 [not null, increment]\n \"fouls_suffered\" varchar(100)\n \"fouls_commited\" varchar(100)\n \"cautions_total\" varchar(100)\n \"cautions_pending\" varchar(100)\n \"caution_points_total\" varchar(100)\n \"caution_points_pending\" varchar(100)\n \"ejections_total\" varchar(100)\n}\n\nTable \"soccer_offensive_stats\" {\n \"id\" int4 [not null, increment]\n \"goals_game_winning\" varchar(100)\n \"goals_game_tying\" varchar(100)\n \"goals_overtime\" varchar(100)\n \"goals_shootout\" varchar(100)\n \"goals_total\" varchar(100)\n \"assists_game_winning\" varchar(100)\n \"assists_game_tying\" varchar(100)\n \"assists_overtime\" varchar(100)\n \"assists_total\" varchar(100)\n \"points\" varchar(100)\n \"shots_total\" varchar(100)\n \"shots_on_goal_total\" varchar(100)\n \"shots_hit_frame\" varchar(100)\n \"shots_penalty_shot_taken\" varchar(100)\n \"shots_penalty_shot_scored\" varchar(100)\n \"shots_penalty_shot_missed\" varchar(40)\n \"shots_penalty_shot_percentage\" varchar(40)\n \"shots_shootout_taken\" varchar(40)\n \"shots_shootout_scored\" varchar(40)\n \"shots_shootout_missed\" varchar(40)\n \"shots_shootout_percentage\" varchar(40)\n \"giveaways\" varchar(40)\n \"offsides\" varchar(40)\n \"corner_kicks\" varchar(40)\n \"hat_tricks\" varchar(40)\n}\n\nTable \"standing_subgroups\" {\n \"id\" int4 [not null, increment]\n \"standing_id\" int4 [not null]\n \"affiliation_id\" int4 [not null]\n}\n\nTable \"standings\" {\n \"id\" int4 [not null, increment]\n \"affiliation_id\" int4 [not null]\n \"standing_type\" varchar(100)\n \"sub_season_id\" int4 [not null]\n \"last_updated\" varchar(100)\n \"duration_scope\" varchar(100)\n \"competition_scope\" varchar(100)\n \"competition_scope_id\" varchar(100)\n \"alignment_scope\" varchar(100)\n \"site_scope\" varchar(100)\n \"scoping_label\" varchar(100)\n \"publisher_id\" int4 [not null]\n \"source\" varchar(100)\n}\n\nTable \"stats\" {\n \"id\" int4 [not null, increment]\n \"stat_repository_type\" varchar(100)\n \"stat_repository_id\" int4 [not null]\n \"stat_holder_type\" varchar(100)\n \"stat_holder_id\" int4\n \"stat_coverage_type\" varchar(100)\n \"stat_coverage_id\" int4\n \"context\" varchar(40) [not null]\n}\n\nTable \"sub_periods\" {\n \"id\" int4 [not null, increment]\n \"period_id\" int4 [not null]\n \"sub_period_value\" varchar(100)\n \"score\" varchar(100)\n}\n\nTable \"sub_seasons\" {\n \"id\" int4 [not null, increment]\n \"sub_season_key\" varchar(100) [not null]\n \"season_id\" int4 [not null]\n \"sub_season_type\" varchar(100) [not null]\n \"start_date_time\" timestamp\n \"end_date_time\" timestamp\n}\n\nTable \"team_american_football_stats\" {\n \"id\" int4 [not null, increment]\n \"yards_per_attempt\" varchar(100)\n \"average_starting_position\" varchar(100)\n \"timeouts\" varchar(100)\n \"time_of_possession\" varchar(100)\n \"turnover_ratio\" varchar(100)\n}\n\nTable \"team_phases\" {\n \"id\" int4 [not null, increment]\n \"team_id\" int4 [not null]\n \"start_season_id\" int4\n \"end_season_id\" int4\n \"affiliation_id\" int4 [not null]\n \"start_date_time\" 
varchar(100)\n \"end_date_time\" varchar(100)\n \"phase_status\" varchar(40)\n \"role_id\" int4\n}\n\nTable \"teams\" {\n \"id\" int4 [not null, increment]\n \"team_key\" varchar(100) [not null]\n \"publisher_id\" int4 [not null]\n \"home_site_id\" int4\n}\n\nTable \"teams_documents\" {\n \"team_id\" int4 [not null]\n \"document_id\" int4 [not null]\n}\n\nTable \"teams_media\" {\n \"team_id\" int4 [not null]\n \"media_id\" int4 [not null]\n}\n\nTable \"tennis_action_points\" {\n \"id\" int4 [not null, increment]\n \"sub_period_id\" varchar(100)\n \"sequence_number\" varchar(100)\n \"win_type\" varchar(100)\n}\n\nTable \"tennis_action_volleys\" {\n \"id\" int4 [not null, increment]\n \"sequence_number\" varchar(100)\n \"tennis_action_points_id\" int4\n \"landing_location\" varchar(100)\n \"swing_type\" varchar(100)\n \"result\" varchar(100)\n \"spin_type\" varchar(100)\n \"trajectory_details\" varchar(100)\n}\n\nTable \"tennis_event_states\" {\n \"id\" int4 [not null, increment]\n \"event_id\" int4 [not null]\n \"current_state\" int4\n \"sequence_number\" int4\n \"tennis_set\" varchar(100)\n \"game\" varchar(100)\n \"server_person_id\" int4\n \"server_score\" varchar(100)\n \"receiver_person_id\" int4\n \"receiver_score\" varchar(100)\n \"service_number\" varchar(100)\n \"context\" varchar(40)\n}\n\nTable \"tennis_return_stats\" {\n \"id\" int4 [not null, increment]\n \"returns_played\" varchar(100)\n \"matches_played\" varchar(100)\n \"first_service_return_points_won\" varchar(100)\n \"first_service_return_points_won_pct\" varchar(100)\n \"second_service_return_points_won\" varchar(100)\n \"second_service_return_points_won_pct\" varchar(100)\n \"return_games_played\" varchar(100)\n \"return_games_won\" varchar(100)\n \"return_games_won_pct\" varchar(100)\n \"break_points_played\" varchar(100)\n \"break_points_converted\" varchar(100)\n \"break_points_converted_pct\" varchar(100)\n}\n\nTable \"tennis_service_stats\" {\n \"id\" int4 [not null, increment]\n \"services_played\" varchar(100)\n \"matches_played\" varchar(100)\n \"aces\" varchar(100)\n \"first_services_good\" varchar(100)\n \"first_services_good_pct\" varchar(100)\n \"first_service_points_won\" varchar(100)\n \"first_service_points_won_pct\" varchar(100)\n \"second_service_points_won\" varchar(100)\n \"second_service_points_won_pct\" varchar(100)\n \"service_games_played\" varchar(100)\n \"service_games_won\" varchar(100)\n \"service_games_won_pct\" varchar(100)\n \"break_points_played\" varchar(100)\n \"break_points_saved\" varchar(100)\n \"break_points_saved_pct\" varchar(100)\n}\n\nTable \"wagering_moneylines\" {\n \"id\" int4 [not null, increment]\n \"bookmaker_id\" int4 [not null]\n \"event_id\" int4 [not null]\n \"date_time\" timestamp\n \"team_id\" int4 [not null]\n \"person_id\" int4\n \"rotation_key\" varchar(100)\n \"comment\" varchar(100)\n \"vigorish\" varchar(100)\n \"line\" varchar(100)\n \"line_opening\" varchar(100)\n \"prediction\" varchar(100)\n}\n\nTable \"wagering_odds_lines\" {\n \"id\" int4 [not null, increment]\n \"bookmaker_id\" int4 [not null]\n \"event_id\" int4 [not null]\n \"date_time\" timestamp\n \"team_id\" int4 [not null]\n \"person_id\" int4\n \"rotation_key\" varchar(100)\n \"comment\" varchar(100)\n \"numerator\" varchar(100)\n \"denominator\" varchar(100)\n \"prediction\" varchar(100)\n \"payout_calculation\" varchar(100)\n \"payout_amount\" varchar(100)\n}\n\nTable \"wagering_runlines\" {\n \"id\" int4 [not null, increment]\n \"bookmaker_id\" int4 [not null]\n \"event_id\" int4 [not null]\n 
\"date_time\" timestamp\n \"team_id\" int4 [not null]\n \"person_id\" int4\n \"rotation_key\" varchar(100)\n \"comment\" varchar(100)\n \"vigorish\" varchar(100)\n \"line\" varchar(100)\n \"line_opening\" varchar(100)\n \"line_value\" varchar(100)\n \"prediction\" varchar(100)\n}\n\nTable \"wagering_straight_spread_lines\" {\n \"id\" int4 [not null, increment]\n \"bookmaker_id\" int4 [not null]\n \"event_id\" int4 [not null]\n \"date_time\" timestamp\n \"team_id\" int4 [not null]\n \"person_id\" int4\n \"rotation_key\" varchar(100)\n \"comment\" varchar(100)\n \"vigorish\" varchar(100)\n \"line_value\" varchar(100)\n \"line_value_opening\" varchar(100)\n \"prediction\" varchar(100)\n}\n\nTable \"wagering_total_score_lines\" {\n \"id\" int4 [not null, increment]\n \"bookmaker_id\" int4 [not null]\n \"event_id\" int4 [not null]\n \"date_time\" timestamp\n \"team_id\" int4 [not null]\n \"person_id\" int4\n \"rotation_key\" varchar(100)\n \"comment\" varchar(100)\n \"vigorish\" varchar(100)\n \"line_over\" varchar(100)\n \"line_under\" varchar(100)\n \"total\" varchar(100)\n \"total_opening\" varchar(100)\n \"prediction\" varchar(100)\n}\n\nTable \"weather_conditions\" {\n \"id\" int4 [not null, increment]\n \"event_id\" int4 [not null]\n \"temperature\" varchar(100)\n \"temperature_units\" varchar(40)\n \"humidity\" varchar(100)\n \"clouds\" varchar(100)\n \"wind_direction\" varchar(100)\n \"wind_velocity\" varchar(100)\n \"weather_code\" varchar(100)\n}\n",
"stateUrl": null,
"stateOriginalUrl": "https://github.com/yugabyte/yugabyte-db/blob/master/sample/sportsdb_tables.sql"
}
}
================================================
FILE: tasks/postgres/standard/sports/participant_report_optimization/verify.py
================================================
"""
Verification script for PostgreSQL Sports Task 3: Query Performance Optimization
"""
import os
import sys
import psycopg2
from decimal import Decimal
def rows_match(actual_row, expected_row):
"""
Compare two rows with appropriate tolerance.
For Decimal types: allows 0.001 tolerance
For other types: requires exact match
"""
if len(actual_row) != len(expected_row):
return False
for actual, expected in zip(actual_row, expected_row):
if isinstance(actual, Decimal) and isinstance(expected, Decimal):
if abs(float(actual) - float(expected)) > 0.001:
return False
elif isinstance(actual, float) and isinstance(expected, float):
if abs(actual - expected) > 0.001:
return False
elif actual != expected:
return False
return True
def get_connection_params() -> dict:
"""Get database connection parameters."""
return {
"host": os.getenv("POSTGRES_HOST", "localhost"),
"port": int(os.getenv("POSTGRES_PORT", 5432)),
"database": os.getenv("POSTGRES_DATABASE", "sports"),
"user": os.getenv("POSTGRES_USERNAME", "postgres"),
"password": os.getenv("POSTGRES_PASSWORD", "postgres")
}
def verify_report_data(conn) -> bool:
"""Verify the report table contains the expected data."""
with conn.cursor() as cur:
# Get actual results from the report table
cur.execute("""
SELECT participant_id, event_count, stat_count, stat_type_count, last_event_date
FROM participant_performance_report
ORDER BY participant_id
""")
actual_results = cur.fetchall()
if len(actual_results) == 0:
print("❌ Report table is empty")
return False
# Execute ground truth query
cur.execute("""
SELECT
pe.participant_id,
COUNT(pe.event_id) as event_count,
(SELECT COUNT(*) FROM stats s WHERE s.stat_holder_id = pe.participant_id AND s.stat_holder_type = 'persons') as stat_count,
(SELECT COUNT(DISTINCT s.stat_repository_type) FROM stats s WHERE s.stat_holder_id = pe.participant_id AND s.stat_holder_type = 'persons') as stat_type_count,
(SELECT MAX(e.start_date_time) FROM events e JOIN participants_events pe2 ON e.id = pe2.event_id WHERE pe2.participant_id = pe.participant_id) as last_event_date
FROM participants_events pe
WHERE pe.participant_id <= 50
GROUP BY pe.participant_id
ORDER BY pe.participant_id
""")
expected_results = cur.fetchall()
if len(actual_results) != len(expected_results):
print(f"❌ Expected {len(expected_results)} report records, got {len(actual_results)}")
return False
mismatches = 0
for actual, expected in zip(actual_results, expected_results):
if not rows_match(actual, expected):
if mismatches < 5:
print(f"❌ Row mismatch: expected {expected}, got {actual}")
mismatches += 1
if mismatches > 0:
print(f"❌ Total mismatches in report data: {mismatches}")
return False
print(f"✅ Report data is correct ({len(actual_results)} records)")
return True
def verify_performance_optimization(conn) -> bool:
"""Verify that key performance optimization indexes have been implemented."""
with conn.cursor() as cur:
print("\n🔍 Checking for critical performance indexes...")
# Check 1: participants_events.participant_id index (critical for subqueries)
cur.execute("""
SELECT indexname, indexdef
FROM pg_indexes
WHERE schemaname = 'public'
AND tablename = 'participants_events'
AND indexdef LIKE '%participant_id%'
""")
participant_indexes = cur.fetchall()
has_participant_index = len(participant_indexes) > 0
# Check 2: stats table optimization (critical for subquery filtering)
cur.execute("""
SELECT indexname, indexdef
FROM pg_indexes
WHERE schemaname = 'public'
AND tablename = 'stats'
AND indexdef LIKE '%stat_holder_type%'
AND indexdef LIKE '%stat_holder_id%'
""")
stats_indexes = cur.fetchall()
has_stats_index = len(stats_indexes) > 0
# Report findings
critical_indexes_found = 0
if has_participant_index:
print("✅ Found participant filtering index on participants_events.participant_id")
critical_indexes_found += 1
else:
print("❌ Missing critical index on participants_events.participant_id")
if has_stats_index:
print("✅ Found subquery optimization index on stats table")
critical_indexes_found += 1
else:
print("❌ Missing critical index on stats table")
# Must have both critical indexes for this subquery-heavy query
if critical_indexes_found >= 2:
print(f"\n✅ Performance optimization: PASS ({critical_indexes_found}/2 critical indexes found)")
return True
else:
print(f"\n❌ Performance optimization: FAIL ({critical_indexes_found}/2 critical indexes found)")
print(" Create these critical indexes:")
print(" - CREATE INDEX ON participants_events(participant_id);")
print(" - CREATE INDEX ON stats(stat_holder_type, stat_holder_id);")
return False
def main():
"""Main verification function."""
print("=" * 50)
print("Verifying Sports Task 3: Query Performance Optimization")
print("=" * 50)
# Get connection parameters
conn_params = get_connection_params()
if not conn_params["database"]:
print("❌ No database specified")
sys.exit(1)
try:
# Connect to database
conn = psycopg2.connect(**conn_params)
# Verify all components
success = (
verify_report_data(conn) and
verify_performance_optimization(conn)
)
conn.close()
if success:
print("\n🎉 Task verification: PASS")
sys.exit(0)
else:
print("\n❌ Task verification: FAIL")
sys.exit(1)
except psycopg2.Error as e:
print(f"❌ Database error: {e}")
sys.exit(1)
except Exception as e:
print(f"❌ Verification error: {e}")
sys.exit(1)
if __name__ == "__main__":
main()
================================================
FILE: tasks/postgres/standard/sports/team_roster_management/description.md
================================================
# Team Roster Management Operations
## Background
You need to manage team rosters for the upcoming season, including player transfers, injury tracking, and performance evaluations.
## Requirements
Complete the following 5 operations in order:
### 1. Set Up Player Performance Tracking
Create a table called `player_evaluation` with the following structure:
- performance_id (serial primary key)
- person_id (integer not null, references persons(id))
- batting_avg (decimal)
- home_runs (integer)
- rbis (integer)
- games_played (integer)
- performance_score (decimal)
- evaluation_date (date)
Add constraint: CHECK (batting_avg BETWEEN 0 AND 1)
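For orientation, one possible table definition matching this structure (a minimal sketch; any equivalent DDL works):
```sql
-- Sketch only: one possible DDL for step 1.
CREATE TABLE player_evaluation (
    performance_id    SERIAL PRIMARY KEY,
    person_id         INTEGER NOT NULL REFERENCES persons(id),
    batting_avg       DECIMAL,
    home_runs         INTEGER,
    rbis              INTEGER,
    games_played      INTEGER,
    performance_score DECIMAL,
    evaluation_date   DATE,
    CHECK (batting_avg BETWEEN 0 AND 1)
);
```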
### 2. Load Historical Player Statistics
Insert player performance data into `player_evaluation`:
- Select all players who have offensive statistics
- Calculate batting_avg as hits/at_bats (handle division by zero)
- Sum up home_runs, rbi from baseball_offensive_stats
- Count games_played from person_event_metadata
- Calculate performance_score as: (batting_avg * 1000) + (home_runs * 5) + (rbi * 2)
- Only include players with at least 10 games played
- Set evaluation_date to '2024-01-01'
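A minimal sketch of one way to write this load; the column sources follow the requirements above, and your exact query may differ:
```sql
-- Sketch only: one way to populate player_evaluation for step 2.
-- Offensive totals come from stats -> baseball_offensive_stats, game counts
-- from person_event_metadata; players with fewer than 10 games are excluded.
INSERT INTO player_evaluation
    (person_id, batting_avg, home_runs, rbis, games_played, performance_score, evaluation_date)
WITH offensive AS (
    SELECT s.stat_holder_id AS person_id,
           CASE WHEN SUM(bos.at_bats) > 0
                THEN 1.0 * SUM(bos.hits) / SUM(bos.at_bats)
                ELSE 0 END AS batting_avg,
           SUM(bos.home_runs) AS home_runs,
           SUM(bos.rbi)       AS rbis
    FROM stats s
    JOIN baseball_offensive_stats bos ON s.stat_repository_id = bos.id
    WHERE s.stat_holder_type = 'persons'
      AND s.stat_repository_type = 'baseball_offensive_stats'
    GROUP BY s.stat_holder_id
),
games AS (
    SELECT person_id, COUNT(DISTINCT event_id) AS games_played
    FROM person_event_metadata
    GROUP BY person_id
)
SELECT o.person_id,
       o.batting_avg,
       o.home_runs,
       o.rbis,
       COALESCE(g.games_played, 0),
       (o.batting_avg * 1000) + (COALESCE(o.home_runs, 0) * 5) + (COALESCE(o.rbis, 0) * 2),
       DATE '2024-01-01'
FROM offensive o
LEFT JOIN games g ON g.person_id = o.person_id
WHERE COALESCE(g.games_played, 0) >= 10;
```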
### 3. Track Player Health Status
Create a table called `player_injury_status`:
- status_id (serial primary key)
- person_id (integer unique not null)
- injury_count (integer default 0)
- last_injury_date (date)
- current_status (varchar check in ('healthy', 'injured', 'recovering'))
Insert data by:
- Include all players from player_evaluation
- Count injuries from injury_phases for each player
- Get the most recent injury start_date as last_injury_date
- Set current_status: 'injured' if injury has no end_date, otherwise 'healthy'
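One possible shape for this step, sketched as a CREATE TABLE plus an INSERT ... SELECT (a sketch only, not the required formulation):
```sql
-- Sketch only: one possible DDL and load for step 3.
CREATE TABLE player_injury_status (
    status_id        SERIAL PRIMARY KEY,
    person_id        INTEGER UNIQUE NOT NULL,
    injury_count     INTEGER DEFAULT 0,
    last_injury_date DATE,
    current_status   VARCHAR CHECK (current_status IN ('healthy', 'injured', 'recovering'))
);

INSERT INTO player_injury_status (person_id, injury_count, last_injury_date, current_status)
SELECT pe.person_id,
       COALESCE(ic.injury_count, 0),
       ic.last_injury_date,
       CASE WHEN COALESCE(ic.has_active_injury, 0) = 1 THEN 'injured' ELSE 'healthy' END
FROM (SELECT DISTINCT person_id FROM player_evaluation) pe
LEFT JOIN (
    SELECT person_id,
           COUNT(*) AS injury_count,
           MAX(start_date_time::date) AS last_injury_date,
           MAX(CASE WHEN end_date_time IS NULL THEN 1 ELSE 0 END) AS has_active_injury
    FROM injury_phases
    GROUP BY person_id
) ic ON ic.person_id = pe.person_id;
```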
### 4. Adjust Scores Based on Health
Update `player_evaluation` to reduce performance scores for injured players:
- Reduce performance_score by 20% for players with current_status = 'injured'
- Reduce performance_score by 10% for players with injury_count > 2 (both reductions apply cumulatively when a player meets both conditions)
- Set minimum performance_score to 0 (no negative scores)
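A short sketch of one way to apply these adjustments in a single statement (assumes both reductions compound for players matching both conditions):
```sql
-- Sketch only: step 4 update; both reductions compound, floored at 0.
UPDATE player_evaluation pe
SET performance_score = GREATEST(
        pe.performance_score
            * CASE WHEN pis.current_status = 'injured' THEN 0.8 ELSE 1 END
            * CASE WHEN pis.injury_count > 2 THEN 0.9 ELSE 1 END,
        0)
FROM player_injury_status pis
WHERE pis.person_id = pe.person_id;
```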
### 5. Generate Performance Summary Report
Create a summary table called `team_performance_summary`:
- summary_id (serial primary key)
- metric_name (varchar unique)
- metric_value (decimal)
Insert the following metrics:
- 'total_players' - count of players in player_evaluation
- 'avg_batting_average' - average batting_avg
- 'total_home_runs' - sum of all home_runs
- 'avg_performance_score' - average performance_score
- 'injured_player_count' - count of injured players
- 'healthy_player_count' - count of healthy players
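One possible way to build and populate the summary (a sketch; any statements producing the same six metrics are fine):
```sql
-- Sketch only: one possible DDL and load for step 5.
CREATE TABLE team_performance_summary (
    summary_id   SERIAL PRIMARY KEY,
    metric_name  VARCHAR UNIQUE,
    metric_value DECIMAL
);

INSERT INTO team_performance_summary (metric_name, metric_value)
SELECT 'total_players',         COUNT(*)               FROM player_evaluation
UNION ALL
SELECT 'avg_batting_average',   AVG(batting_avg)       FROM player_evaluation
UNION ALL
SELECT 'total_home_runs',       SUM(home_runs)         FROM player_evaluation
UNION ALL
SELECT 'avg_performance_score', AVG(performance_score) FROM player_evaluation
UNION ALL
SELECT 'injured_player_count',
       SUM(CASE WHEN current_status = 'injured' THEN 1 ELSE 0 END) FROM player_injury_status
UNION ALL
SELECT 'healthy_player_count',
       SUM(CASE WHEN current_status = 'healthy' THEN 1 ELSE 0 END) FROM player_injury_status;
```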
## Important Notes
- Handle NULL values appropriately (treat as 0 where needed)
- Ensure foreign key constraints are properly set
- Do NOT use ROUND functions in calculations
- Use COALESCE to handle NULL values in calculations
================================================
FILE: tasks/postgres/standard/sports/team_roster_management/meta.json
================================================
{
"task_id": "team_roster_management",
"task_name": "Team Roster Management",
"category_id": "sports",
"category_name": "Sports",
"description": "Manage team rosters with player transfers, injury tracking, performance evaluations, and health status adjustments.",
"author": "Lingxiao Du",
"created_at": "2025-08-18",
"difficulty": "L3",
"tags": [
"schema design",
"data migration",
"statistical aggregation"
],
"mcp": [
"postgres"
],
"meta_data": {
"stateType": "text",
"stateContent": "Table \"addresses\" {\n \"id\" int4 [not null, increment]\n \"location_id\" int4 [not null]\n \"language\" varchar(100)\n \"suite\" varchar(100)\n \"floor\" varchar(100)\n \"building\" varchar(100)\n \"street_number\" varchar(100)\n \"street_prefix\" varchar(100)\n \"street\" varchar(100)\n \"street_suffix\" varchar(100)\n \"neighborhood\" varchar(100)\n \"district\" varchar(100)\n \"locality\" varchar(100)\n \"county\" varchar(100)\n \"region\" varchar(100)\n \"postal_code\" varchar(100)\n \"country\" varchar(100)\n}\n\nTable \"affiliation_phases\" {\n \"id\" int4 [not null, increment]\n \"affiliation_id\" int4 [not null]\n \"ancestor_affiliation_id\" int4\n \"start_season_id\" int4\n \"start_date_time\" timestamp\n \"end_season_id\" int4\n \"end_date_time\" timestamp\n}\n\nTable \"affiliations\" {\n \"id\" int4 [not null, increment]\n \"affiliation_key\" varchar(100) [not null]\n \"affiliation_type\" varchar(100)\n \"publisher_id\" int4 [not null]\n}\n\nTable \"affiliations_documents\" {\n \"affiliation_id\" int4 [not null]\n \"document_id\" int4 [not null]\n}\n\nTable \"affiliations_events\" {\n \"affiliation_id\" int4 [not null]\n \"event_id\" int4 [not null]\n}\n\nTable \"affiliations_media\" {\n \"affiliation_id\" int4 [not null]\n \"media_id\" int4 [not null]\n}\n\nTable \"american_football_action_participants\" {\n \"id\" int4 [not null, increment]\n \"american_football_action_play_id\" int4 [not null]\n \"person_id\" int4 [not null]\n \"participant_role\" varchar(100) [not null]\n \"score_type\" varchar(100)\n \"field_line\" int4\n \"yardage\" int4\n \"score_credit\" int4\n \"yards_gained\" int4\n}\n\nTable \"american_football_action_plays\" {\n \"id\" int4 [not null, increment]\n \"american_football_event_state_id\" int4 [not null]\n \"play_type\" varchar(100)\n \"score_attempt_type\" varchar(100)\n \"drive_result\" varchar(100)\n \"points\" int4\n \"comment\" varchar(255)\n}\n\nTable \"american_football_defensive_stats\" {\n \"id\" int4 [not null, increment]\n \"tackles_total\" varchar(100)\n \"tackles_solo\" varchar(100)\n \"tackles_assists\" varchar(100)\n \"interceptions_total\" varchar(100)\n \"interceptions_yards\" varchar(100)\n \"interceptions_average\" varchar(100)\n \"interceptions_longest\" varchar(100)\n \"interceptions_touchdown\" varchar(100)\n \"quarterback_hurries\" varchar(100)\n \"sacks_total\" varchar(100)\n \"sacks_yards\" varchar(100)\n \"passes_defensed\" varchar(100)\n}\n\nTable \"american_football_down_progress_stats\" {\n \"id\" int4 [not null, increment]\n \"first_downs_total\" varchar(100)\n \"first_downs_pass\" varchar(100)\n \"first_downs_run\" varchar(100)\n \"first_downs_penalty\" varchar(100)\n \"conversions_third_down\" varchar(100)\n \"conversions_third_down_attempts\" varchar(100)\n \"conversions_third_down_percentage\" varchar(100)\n \"conversions_fourth_down\" varchar(100)\n \"conversions_fourth_down_attempts\" varchar(100)\n \"conversions_fourth_down_percentage\" varchar(100)\n}\n\nTable \"american_football_event_states\" {\n \"id\" int4 [not null, increment]\n \"event_id\" int4 [not null]\n \"current_state\" int2\n \"sequence_number\" int4\n \"period_value\" int4\n \"period_time_elapsed\" varchar(100)\n \"period_time_remaining\" varchar(100)\n \"clock_state\" varchar(100)\n \"down\" int4\n \"team_in_possession_id\" int4\n \"distance_for_1st_down\" int4\n \"field_side\" varchar(100)\n \"field_line\" int4\n \"context\" varchar(40)\n}\n\nTable \"american_football_fumbles_stats\" {\n \"id\" int4 [not null, increment]\n 
\"fumbles_committed\" varchar(100)\n \"fumbles_forced\" varchar(100)\n \"fumbles_recovered\" varchar(100)\n \"fumbles_lost\" varchar(100)\n \"fumbles_yards_gained\" varchar(100)\n \"fumbles_own_committed\" varchar(100)\n \"fumbles_own_recovered\" varchar(100)\n \"fumbles_own_lost\" varchar(100)\n \"fumbles_own_yards_gained\" varchar(100)\n \"fumbles_opposing_committed\" varchar(100)\n \"fumbles_opposing_recovered\" varchar(100)\n \"fumbles_opposing_lost\" varchar(100)\n \"fumbles_opposing_yards_gained\" varchar(100)\n}\n\nTable \"american_football_offensive_stats\" {\n \"id\" int4 [not null, increment]\n \"offensive_plays_yards\" varchar(100)\n \"offensive_plays_number\" varchar(100)\n \"offensive_plays_average_yards_per\" varchar(100)\n \"possession_duration\" varchar(100)\n \"turnovers_giveaway\" varchar(100)\n}\n\nTable \"american_football_passing_stats\" {\n \"id\" int4 [not null, increment]\n \"passes_attempts\" varchar(100)\n \"passes_completions\" varchar(100)\n \"passes_percentage\" varchar(100)\n \"passes_yards_gross\" varchar(100)\n \"passes_yards_net\" varchar(100)\n \"passes_yards_lost\" varchar(100)\n \"passes_touchdowns\" varchar(100)\n \"passes_touchdowns_percentage\" varchar(100)\n \"passes_interceptions\" varchar(100)\n \"passes_interceptions_percentage\" varchar(100)\n \"passes_longest\" varchar(100)\n \"passes_average_yards_per\" varchar(100)\n \"passer_rating\" varchar(100)\n \"receptions_total\" varchar(100)\n \"receptions_yards\" varchar(100)\n \"receptions_touchdowns\" varchar(100)\n \"receptions_first_down\" varchar(100)\n \"receptions_longest\" varchar(100)\n \"receptions_average_yards_per\" varchar(100)\n}\n\nTable \"american_football_penalties_stats\" {\n \"id\" int4 [not null, increment]\n \"penalties_total\" varchar(100)\n \"penalty_yards\" varchar(100)\n \"penalty_first_downs\" varchar(100)\n}\n\nTable \"american_football_rushing_stats\" {\n \"id\" int4 [not null, increment]\n \"rushes_attempts\" varchar(100)\n \"rushes_yards\" varchar(100)\n \"rushes_touchdowns\" varchar(100)\n \"rushing_average_yards_per\" varchar(100)\n \"rushes_first_down\" varchar(100)\n \"rushes_longest\" varchar(100)\n}\n\nTable \"american_football_sacks_against_stats\" {\n \"id\" int4 [not null, increment]\n \"sacks_against_yards\" varchar(100)\n \"sacks_against_total\" varchar(100)\n}\n\nTable \"american_football_scoring_stats\" {\n \"id\" int4 [not null, increment]\n \"touchdowns_total\" varchar(100)\n \"touchdowns_passing\" varchar(100)\n \"touchdowns_rushing\" varchar(100)\n \"touchdowns_special_teams\" varchar(100)\n \"touchdowns_defensive\" varchar(100)\n \"extra_points_attempts\" varchar(100)\n \"extra_points_made\" varchar(100)\n \"extra_points_missed\" varchar(100)\n \"extra_points_blocked\" varchar(100)\n \"field_goal_attempts\" varchar(100)\n \"field_goals_made\" varchar(100)\n \"field_goals_missed\" varchar(100)\n \"field_goals_blocked\" varchar(100)\n \"safeties_against\" varchar(100)\n \"two_point_conversions_attempts\" varchar(100)\n \"two_point_conversions_made\" varchar(100)\n \"touchbacks_total\" varchar(100)\n}\n\nTable \"american_football_special_teams_stats\" {\n \"id\" int4 [not null, increment]\n \"returns_punt_total\" varchar(100)\n \"returns_punt_yards\" varchar(100)\n \"returns_punt_average\" varchar(100)\n \"returns_punt_longest\" varchar(100)\n \"returns_punt_touchdown\" varchar(100)\n \"returns_kickoff_total\" varchar(100)\n \"returns_kickoff_yards\" varchar(100)\n \"returns_kickoff_average\" varchar(100)\n \"returns_kickoff_longest\" varchar(100)\n 
\"returns_kickoff_touchdown\" varchar(100)\n \"returns_total\" varchar(100)\n \"returns_yards\" varchar(100)\n \"punts_total\" varchar(100)\n \"punts_yards_gross\" varchar(100)\n \"punts_yards_net\" varchar(100)\n \"punts_longest\" varchar(100)\n \"punts_inside_20\" varchar(100)\n \"punts_inside_20_percentage\" varchar(100)\n \"punts_average\" varchar(100)\n \"punts_blocked\" varchar(100)\n \"touchbacks_total\" varchar(100)\n \"touchbacks_total_percentage\" varchar(100)\n \"touchbacks_kickoffs\" varchar(100)\n \"touchbacks_kickoffs_percentage\" varchar(100)\n \"touchbacks_punts\" varchar(100)\n \"touchbacks_punts_percentage\" varchar(100)\n \"touchbacks_interceptions\" varchar(100)\n \"touchbacks_interceptions_percentage\" varchar(100)\n \"fair_catches\" varchar(100)\n}\n\nTable \"baseball_action_contact_details\" {\n \"id\" int4 [not null, increment]\n \"baseball_action_pitch_id\" int4 [not null]\n \"location\" varchar(100)\n \"strength\" varchar(100)\n \"velocity\" int4\n \"comment\" text\n \"trajectory_coordinates\" varchar(100)\n \"trajectory_formula\" varchar(100)\n}\n\nTable \"baseball_action_pitches\" {\n \"id\" int4 [not null, increment]\n \"baseball_action_play_id\" int4 [not null]\n \"sequence_number\" int4\n \"baseball_defensive_group_id\" int4\n \"umpire_call\" varchar(100)\n \"pitch_location\" varchar(100)\n \"pitch_type\" varchar(100)\n \"pitch_velocity\" int4\n \"comment\" text\n \"trajectory_coordinates\" varchar(100)\n \"trajectory_formula\" varchar(100)\n \"ball_type\" varchar(40)\n \"strike_type\" varchar(40)\n}\n\nTable \"baseball_action_plays\" {\n \"id\" int4 [not null, increment]\n \"baseball_event_state_id\" int4 [not null]\n \"play_type\" varchar(100)\n \"notation\" varchar(100)\n \"notation_yaml\" text\n \"baseball_defensive_group_id\" int4\n \"comment\" varchar(255)\n \"runner_on_first_advance\" int4\n \"runner_on_second_advance\" int4\n \"runner_on_third_advance\" int4\n \"outs_recorded\" int4\n \"rbi\" int4\n \"runs_scored\" int4\n \"earned_runs_scored\" varchar(100)\n}\n\nTable \"baseball_action_substitutions\" {\n \"id\" int4 [not null, increment]\n \"baseball_event_state_id\" int4 [not null]\n \"sequence_number\" int4\n \"person_type\" varchar(100)\n \"person_original_id\" int4\n \"person_original_position_id\" int4\n \"person_original_lineup_slot\" int4\n \"person_replacing_id\" int4\n \"person_replacing_position_id\" int4\n \"person_replacing_lineup_slot\" int4\n \"substitution_reason\" varchar(100)\n \"comment\" varchar(100)\n}\n\nTable \"baseball_defensive_group\" {\n \"id\" int4 [not null, increment]\n}\n\nTable \"baseball_defensive_players\" {\n \"id\" int4 [not null, increment]\n \"baseball_defensive_group_id\" int4 [not null]\n \"player_id\" int4 [not null]\n \"position_id\" int4 [not null]\n}\n\nTable \"baseball_defensive_stats\" {\n \"id\" int4 [not null, increment]\n \"double_plays\" int4\n \"triple_plays\" int4\n \"putouts\" int4\n \"assists\" int4\n \"errors\" int4\n \"fielding_percentage\" numeric\n \"defensive_average\" numeric\n \"errors_passed_ball\" int4\n \"errors_catchers_interference\" int4\n}\n\nTable \"baseball_event_states\" {\n \"id\" int4 [not null, increment]\n \"event_id\" int4 [not null]\n \"current_state\" int2\n \"sequence_number\" int4\n \"at_bat_number\" int4\n \"inning_value\" int4\n \"inning_half\" varchar(100)\n \"outs\" int4\n \"balls\" int4\n \"strikes\" int4\n \"runner_on_first_id\" int4\n \"runner_on_second_id\" int4\n \"runner_on_third_id\" int4\n \"runner_on_first\" int2\n \"runner_on_second\" int2\n 
\"runner_on_third\" int2\n \"runs_this_inning_half\" int4\n \"pitcher_id\" int4\n \"batter_id\" int4\n \"batter_side\" varchar(100)\n \"context\" varchar(40)\n}\n\nTable \"baseball_offensive_stats\" {\n \"id\" int4 [not null, increment]\n \"average\" numeric\n \"runs_scored\" int4\n \"at_bats\" int4\n \"hits\" int4\n \"rbi\" int4\n \"total_bases\" int4\n \"slugging_percentage\" numeric\n \"bases_on_balls\" int4\n \"strikeouts\" int4\n \"left_on_base\" int4\n \"left_in_scoring_position\" int4\n \"singles\" int4\n \"doubles\" int4\n \"triples\" int4\n \"home_runs\" int4\n \"grand_slams\" int4\n \"at_bats_per_rbi\" numeric\n \"plate_appearances_per_rbi\" numeric\n \"at_bats_per_home_run\" numeric\n \"plate_appearances_per_home_run\" numeric\n \"sac_flies\" int4\n \"sac_bunts\" int4\n \"grounded_into_double_play\" int4\n \"moved_up\" int4\n \"on_base_percentage\" numeric\n \"stolen_bases\" int4\n \"stolen_bases_caught\" int4\n \"stolen_bases_average\" numeric\n \"hit_by_pitch\" int4\n \"defensive_interferance_reaches\" int4\n \"on_base_plus_slugging\" numeric\n \"plate_appearances\" int4\n \"hits_extra_base\" int4\n}\n\nTable \"baseball_pitching_stats\" {\n \"id\" int4 [not null, increment]\n \"runs_allowed\" int4\n \"singles_allowed\" int4\n \"doubles_allowed\" int4\n \"triples_allowed\" int4\n \"home_runs_allowed\" int4\n \"innings_pitched\" varchar(20)\n \"hits\" int4\n \"earned_runs\" int4\n \"unearned_runs\" int4\n \"bases_on_balls\" int4\n \"bases_on_balls_intentional\" int4\n \"strikeouts\" int4\n \"strikeout_to_bb_ratio\" numeric\n \"number_of_pitches\" int4\n \"era\" numeric\n \"inherited_runners_scored\" int4\n \"pick_offs\" int4\n \"errors_hit_with_pitch\" int4\n \"errors_wild_pitch\" int4\n \"balks\" int4\n \"wins\" int4\n \"losses\" int4\n \"saves\" int4\n \"shutouts\" int4\n \"games_complete\" int4\n \"games_finished\" int4\n \"winning_percentage\" numeric\n \"event_credit\" varchar(40)\n \"save_credit\" varchar(40)\n}\n\nTable \"basketball_defensive_stats\" {\n \"id\" int4 [not null, increment]\n \"steals_total\" varchar(100)\n \"steals_per_game\" varchar(100)\n \"blocks_total\" varchar(100)\n \"blocks_per_game\" varchar(100)\n}\n\nTable \"basketball_event_states\" {\n \"id\" int4 [not null, increment]\n \"event_id\" int4 [not null]\n \"current_state\" int4\n \"sequence_number\" int4\n \"period_value\" varchar(100)\n \"period_time_elapsed\" varchar(100)\n \"period_time_remaining\" varchar(100)\n \"context\" varchar(40)\n}\n\nTable \"basketball_offensive_stats\" {\n \"id\" int4 [not null, increment]\n \"field_goals_made\" int4\n \"field_goals_attempted\" int4\n \"field_goals_percentage\" varchar(100)\n \"field_goals_per_game\" varchar(100)\n \"field_goals_attempted_per_game\" varchar(100)\n \"field_goals_percentage_adjusted\" varchar(100)\n \"three_pointers_made\" int4\n \"three_pointers_attempted\" int4\n \"three_pointers_percentage\" varchar(100)\n \"three_pointers_per_game\" varchar(100)\n \"three_pointers_attempted_per_game\" varchar(100)\n \"free_throws_made\" varchar(100)\n \"free_throws_attempted\" varchar(100)\n \"free_throws_percentage\" varchar(100)\n \"free_throws_per_game\" varchar(100)\n \"free_throws_attempted_per_game\" varchar(100)\n \"points_scored_total\" varchar(100)\n \"points_scored_per_game\" varchar(100)\n \"assists_total\" varchar(100)\n \"assists_per_game\" varchar(100)\n \"turnovers_total\" varchar(100)\n \"turnovers_per_game\" varchar(100)\n \"points_scored_off_turnovers\" varchar(100)\n \"points_scored_in_paint\" varchar(100)\n 
\"points_scored_on_second_chance\" varchar(100)\n \"points_scored_on_fast_break\" varchar(100)\n}\n\nTable \"basketball_rebounding_stats\" {\n \"id\" int4 [not null, increment]\n \"rebounds_total\" varchar(100)\n \"rebounds_per_game\" varchar(100)\n \"rebounds_defensive\" varchar(100)\n \"rebounds_offensive\" varchar(100)\n \"team_rebounds_total\" varchar(100)\n \"team_rebounds_per_game\" varchar(100)\n \"team_rebounds_defensive\" varchar(100)\n \"team_rebounds_offensive\" varchar(100)\n}\n\nTable \"basketball_team_stats\" {\n \"id\" int4 [not null, increment]\n \"timeouts_left\" varchar(100)\n \"largest_lead\" varchar(100)\n \"fouls_total\" varchar(100)\n \"turnover_margin\" varchar(100)\n}\n\nTable \"bookmakers\" {\n \"id\" int4 [not null, increment]\n \"bookmaker_key\" varchar(100)\n \"publisher_id\" int4 [not null]\n \"location_id\" int4\n}\n\nTable \"core_person_stats\" {\n \"id\" int4 [not null, increment]\n \"time_played_event\" varchar(40)\n \"time_played_total\" varchar(40)\n \"time_played_event_average\" varchar(40)\n \"events_played\" int4\n \"events_started\" int4\n \"position_id\" int4\n}\n\nTable \"core_stats\" {\n \"id\" int4 [not null, increment]\n \"score\" varchar(100)\n \"score_opposing\" varchar(100)\n \"score_attempts\" varchar(100)\n \"score_attempts_opposing\" varchar(100)\n \"score_percentage\" varchar(100)\n \"score_percentage_opposing\" varchar(100)\n}\n\nTable \"db_info\" {\n \"version\" varchar(100) [not null, default: 16]\n}\n\nTable \"display_names\" {\n \"id\" int4 [not null, increment]\n \"language\" varchar(100) [not null]\n \"entity_type\" varchar(100) [not null]\n \"entity_id\" int4 [not null]\n \"full_name\" varchar(100)\n \"first_name\" varchar(100)\n \"middle_name\" varchar(100)\n \"last_name\" varchar(100)\n \"alias\" varchar(100)\n \"abbreviation\" varchar(100)\n \"short_name\" varchar(100)\n \"prefix\" varchar(20)\n \"suffix\" varchar(20)\n}\n\nTable \"document_classes\" {\n \"id\" int4 [not null, increment]\n \"name\" varchar(100)\n}\n\nTable \"document_contents\" {\n \"id\" int4 [not null, increment]\n \"document_id\" int4 [not null]\n \"sportsml\" varchar(200)\n \"abstract\" text\n}\n\nTable \"document_fixtures\" {\n \"id\" int4 [not null, increment]\n \"fixture_key\" varchar(100)\n \"publisher_id\" int4 [not null]\n \"name\" varchar(100)\n \"document_class_id\" int4 [not null]\n}\n\nTable \"document_fixtures_events\" {\n \"id\" int4 [not null, increment]\n \"document_fixture_id\" int4 [not null]\n \"event_id\" int4 [not null]\n \"latest_document_id\" int4 [not null]\n \"last_update\" timestamp\n}\n\nTable \"document_package_entry\" {\n \"id\" int4 [not null, increment]\n \"document_package_id\" int4 [not null]\n \"rank\" varchar(100)\n \"document_id\" int4 [not null]\n \"headline\" varchar(100)\n \"short_headline\" varchar(100)\n}\n\nTable \"document_packages\" {\n \"id\" int4 [not null, increment]\n \"package_key\" varchar(100)\n \"package_name\" varchar(100)\n \"date_time\" date\n}\n\nTable \"documents\" {\n \"id\" int4 [not null, increment]\n \"doc_id\" varchar(75) [not null]\n \"publisher_id\" int4 [not null]\n \"date_time\" timestamp\n \"title\" varchar(255)\n \"language\" varchar(100)\n \"priority\" varchar(100)\n \"revision_id\" varchar(75)\n \"stats_coverage\" varchar(100)\n \"document_fixture_id\" int4 [not null]\n \"source_id\" int4\n \"db_loading_date_time\" timestamp\n}\n\nTable \"documents_media\" {\n \"id\" int4 [not null, increment]\n \"document_id\" int4 [not null]\n \"media_id\" int4 [not null]\n \"media_caption_id\" int4 [not 
null]\n}\n\nTable \"events\" {\n \"id\" int4 [not null, increment]\n \"event_key\" varchar(100) [not null]\n \"publisher_id\" int4 [not null]\n \"start_date_time\" timestamp\n \"site_id\" int4\n \"site_alignment\" varchar(100)\n \"event_status\" varchar(100)\n \"duration\" varchar(100)\n \"attendance\" varchar(100)\n \"last_update\" timestamp\n}\n\nTable \"events_documents\" {\n \"event_id\" int4 [not null]\n \"document_id\" int4 [not null]\n}\n\nTable \"events_media\" {\n \"event_id\" int4 [not null]\n \"media_id\" int4 [not null]\n}\n\nTable \"events_sub_seasons\" {\n \"event_id\" int4 [not null]\n \"sub_season_id\" int4 [not null]\n}\n\nTable \"ice_hockey_action_participants\" {\n \"id\" int4 [not null, increment]\n \"ice_hockey_action_play_id\" int4 [not null]\n \"person_id\" int4 [not null]\n \"participant_role\" varchar(100) [not null]\n \"point_credit\" int4\n}\n\nTable \"ice_hockey_action_plays\" {\n \"id\" int4 [not null, increment]\n \"ice_hockey_event_state_id\" int4 [not null]\n \"play_type\" varchar(100)\n \"score_attempt_type\" varchar(100)\n \"play_result\" varchar(100)\n \"comment\" varchar(255)\n}\n\nTable \"ice_hockey_defensive_stats\" {\n \"id\" int4 [not null, increment]\n \"shots_power_play_allowed\" varchar(100)\n \"shots_penalty_shot_allowed\" varchar(100)\n \"goals_power_play_allowed\" varchar(100)\n \"goals_penalty_shot_allowed\" varchar(100)\n \"goals_against_average\" varchar(100)\n \"saves\" varchar(100)\n \"save_percentage\" varchar(100)\n \"penalty_killing_amount\" varchar(100)\n \"penalty_killing_percentage\" varchar(100)\n \"shots_blocked\" varchar(100)\n \"takeaways\" varchar(100)\n \"shutouts\" varchar(100)\n \"minutes_penalty_killing\" varchar(100)\n \"hits\" varchar(100)\n \"goals_empty_net_allowed\" varchar(100)\n \"goals_short_handed_allowed\" varchar(100)\n \"goals_shootout_allowed\" varchar(100)\n \"shots_shootout_allowed\" varchar(100)\n}\n\nTable \"ice_hockey_event_states\" {\n \"id\" int4 [not null, increment]\n \"event_id\" int4 [not null]\n \"current_state\" int4\n \"sequence_number\" int4\n \"period_value\" varchar(100)\n \"period_time_elapsed\" varchar(100)\n \"period_time_remaining\" varchar(100)\n \"context\" varchar(40)\n}\n\nTable \"ice_hockey_offensive_stats\" {\n \"id\" int4 [not null, increment]\n \"goals_game_winning\" varchar(100)\n \"goals_game_tying\" varchar(100)\n \"goals_power_play\" varchar(100)\n \"goals_short_handed\" varchar(100)\n \"goals_even_strength\" varchar(100)\n \"goals_empty_net\" varchar(100)\n \"goals_overtime\" varchar(100)\n \"goals_shootout\" varchar(100)\n \"goals_penalty_shot\" varchar(100)\n \"assists\" varchar(100)\n \"points\" varchar(100)\n \"power_play_amount\" varchar(100)\n \"power_play_percentage\" varchar(100)\n \"shots_penalty_shot_taken\" varchar(100)\n \"shots_penalty_shot_missed\" varchar(100)\n \"shots_penalty_shot_percentage\" varchar(100)\n \"giveaways\" varchar(100)\n \"minutes_power_play\" varchar(100)\n \"faceoff_wins\" varchar(100)\n \"faceoff_losses\" varchar(100)\n \"faceoff_win_percentage\" varchar(100)\n \"scoring_chances\" varchar(100)\n}\n\nTable \"ice_hockey_player_stats\" {\n \"id\" int4 [not null, increment]\n \"plus_minus\" varchar(100)\n}\n\nTable \"injury_phases\" {\n \"id\" int4 [not null, increment]\n \"person_id\" int4 [not null]\n \"injury_status\" varchar(100)\n \"injury_type\" varchar(100)\n \"injury_comment\" varchar(100)\n \"disabled_list\" varchar(100)\n \"start_date_time\" timestamp\n \"end_date_time\" timestamp\n \"season_id\" int4\n \"phase_type\" varchar(100)\n 
\"injury_side\" varchar(100)\n}\n\nTable \"key_aliases\" {\n \"id\" int4 [not null, increment]\n \"key_id\" int4 [not null]\n \"key_root_id\" int4 [not null]\n}\n\nTable \"key_roots\" {\n \"id\" int4 [not null, increment]\n \"key_type\" varchar(100)\n}\n\nTable \"latest_revisions\" {\n \"id\" int4 [not null, increment]\n \"revision_id\" varchar(75) [not null]\n \"latest_document_id\" int4 [not null]\n}\n\nTable \"locations\" {\n \"id\" int4 [not null, increment]\n \"timezone\" varchar(100)\n \"latitude\" varchar(100)\n \"longitude\" varchar(100)\n \"country_code\" varchar(100)\n}\n\nTable \"media\" {\n \"id\" int4 [not null, increment]\n \"object_id\" int4\n \"source_id\" int4\n \"revision_id\" int4\n \"media_type\" varchar(100)\n \"publisher_id\" int4 [not null]\n \"date_time\" varchar(100)\n \"credit_id\" int4 [not null]\n \"db_loading_date_time\" timestamp\n \"creation_location_id\" int4 [not null]\n}\n\nTable \"media_captions\" {\n \"id\" int4 [not null, increment]\n \"media_id\" int4 [not null]\n \"caption_type\" varchar(100)\n \"caption\" varchar(100)\n \"caption_author_id\" int4 [not null]\n \"language\" varchar(100)\n \"caption_size\" varchar(100)\n}\n\nTable \"media_contents\" {\n \"id\" int4 [not null, increment]\n \"media_id\" int4 [not null]\n \"object\" varchar(100)\n \"format\" varchar(100)\n \"mime_type\" varchar(100)\n \"height\" varchar(100)\n \"width\" varchar(100)\n \"duration\" varchar(100)\n \"file_size\" varchar(100)\n \"resolution\" varchar(100)\n}\n\nTable \"media_keywords\" {\n \"id\" int4 [not null, increment]\n \"keyword\" varchar(100)\n \"media_id\" int4 [not null]\n}\n\nTable \"motor_racing_event_states\" {\n \"id\" int4 [not null, increment]\n \"event_id\" int4 [not null]\n \"current_state\" int4\n \"sequence_number\" int4\n \"lap\" varchar(100)\n \"laps_remaining\" varchar(100)\n \"time_elapsed\" varchar(100)\n \"flag_state\" varchar(100)\n \"context\" varchar(40)\n}\n\nTable \"motor_racing_qualifying_stats\" {\n \"id\" int4 [not null, increment]\n \"grid\" varchar(100)\n \"pole_position\" varchar(100)\n \"pole_wins\" varchar(100)\n \"qualifying_speed\" varchar(100)\n \"qualifying_speed_units\" varchar(100)\n \"qualifying_time\" varchar(100)\n \"qualifying_position\" varchar(100)\n}\n\nTable \"motor_racing_race_stats\" {\n \"id\" int4 [not null, increment]\n \"time_behind_leader\" varchar(100)\n \"laps_behind_leader\" varchar(100)\n \"time_ahead_follower\" varchar(100)\n \"laps_ahead_follower\" varchar(100)\n \"time\" varchar(100)\n \"points\" varchar(100)\n \"points_rookie\" varchar(100)\n \"bonus\" varchar(100)\n \"laps_completed\" varchar(100)\n \"laps_leading_total\" varchar(100)\n \"distance_leading\" varchar(100)\n \"distance_completed\" varchar(100)\n \"distance_units\" varchar(40)\n \"speed_average\" varchar(40)\n \"speed_units\" varchar(40)\n \"status\" varchar(40)\n \"finishes_top_5\" varchar(40)\n \"finishes_top_10\" varchar(40)\n \"starts\" varchar(40)\n \"finishes\" varchar(40)\n \"non_finishes\" varchar(40)\n \"wins\" varchar(40)\n \"races_leading\" varchar(40)\n \"money\" varchar(40)\n \"money_units\" varchar(40)\n \"leads_total\" varchar(40)\n}\n\nTable \"outcome_totals\" {\n \"id\" int4 [not null, increment]\n \"standing_subgroup_id\" int4 [not null]\n \"outcome_holder_type\" varchar(100)\n \"outcome_holder_id\" int4\n \"rank\" varchar(100)\n \"wins\" varchar(100)\n \"losses\" varchar(100)\n \"ties\" varchar(100)\n \"undecideds\" varchar(100)\n \"winning_percentage\" varchar(100)\n \"points_scored_for\" varchar(100)\n 
\"points_scored_against\" varchar(100)\n \"points_difference\" varchar(100)\n \"standing_points\" varchar(100)\n \"streak_type\" varchar(100)\n \"streak_duration\" varchar(100)\n \"streak_total\" varchar(100)\n \"streak_start\" date\n \"streak_end\" date\n}\n\nTable \"participants_events\" {\n \"id\" int4 [not null, increment]\n \"participant_type\" varchar(100) [not null]\n \"participant_id\" int4 [not null]\n \"event_id\" int4 [not null]\n \"alignment\" varchar(100)\n \"score\" varchar(100)\n \"event_outcome\" varchar(100)\n \"rank\" int4\n}\n\nTable \"periods\" {\n \"id\" int4 [not null, increment]\n \"participant_event_id\" int4 [not null]\n \"period_value\" varchar(100)\n \"score\" varchar(100)\n}\n\nTable \"person_event_metadata\" {\n \"id\" int4 [not null, increment]\n \"person_id\" int4 [not null]\n \"event_id\" int4 [not null]\n \"status\" varchar(100)\n \"health\" varchar(100)\n \"weight\" varchar(100)\n \"role_id\" int4\n \"position_id\" int4\n \"team_id\" int4\n \"lineup_slot\" int4\n \"lineup_slot_sequence\" int4\n}\n\nTable \"person_phases\" {\n \"id\" int4 [not null, increment]\n \"person_id\" int4 [not null]\n \"membership_type\" varchar(40) [not null]\n \"membership_id\" int4 [not null]\n \"role_id\" int4\n \"role_status\" varchar(40)\n \"phase_status\" varchar(40)\n \"uniform_number\" varchar(20)\n \"regular_position_id\" int4\n \"regular_position_depth\" varchar(40)\n \"height\" varchar(100)\n \"weight\" varchar(100)\n \"start_date_time\" timestamp\n \"start_season_id\" int4\n \"end_date_time\" timestamp\n \"end_season_id\" int4\n \"entry_reason\" varchar(40)\n \"exit_reason\" varchar(40)\n \"selection_level\" int4\n \"selection_sublevel\" int4\n \"selection_overall\" int4\n}\n\nTable \"persons\" {\n \"id\" int4 [not null, increment]\n \"person_key\" varchar(100) [not null]\n \"publisher_id\" int4 [not null]\n \"gender\" varchar(20)\n \"birth_date\" varchar(30)\n \"death_date\" varchar(30)\n \"birth_location_id\" int4\n \"hometown_location_id\" int4\n \"residence_location_id\" int4\n \"death_location_id\" int4\n}\n\nTable \"persons_documents\" {\n \"person_id\" int4 [not null]\n \"document_id\" int4 [not null]\n}\n\nTable \"persons_media\" {\n \"person_id\" int4 [not null]\n \"media_id\" int4 [not null]\n}\n\nTable \"positions\" {\n \"id\" int4 [not null, increment]\n \"affiliation_id\" int4 [not null]\n \"abbreviation\" varchar(100) [not null]\n}\n\nTable \"publishers\" {\n \"id\" int4 [not null, increment]\n \"publisher_key\" varchar(100) [not null]\n \"publisher_name\" varchar(100)\n}\n\nTable \"roles\" {\n \"id\" int4 [not null, increment]\n \"role_key\" varchar(100) [not null]\n \"role_name\" varchar(100)\n \"comment\" varchar(100)\n}\n\nTable \"seasons\" {\n \"id\" int4 [not null, increment]\n \"season_key\" int4 [not null]\n \"publisher_id\" int4 [not null]\n \"league_id\" int4 [not null]\n \"start_date_time\" timestamp\n \"end_date_time\" timestamp\n}\n\nTable \"sites\" {\n \"id\" int4 [not null, increment]\n \"site_key\" int4 [not null]\n \"publisher_id\" int4 [not null]\n \"location_id\" int4\n}\n\nTable \"soccer_defensive_stats\" {\n \"id\" int4 [not null, increment]\n \"shots_penalty_shot_allowed\" varchar(100)\n \"goals_penalty_shot_allowed\" varchar(100)\n \"goals_against_average\" varchar(100)\n \"goals_against_total\" varchar(100)\n \"saves\" varchar(100)\n \"save_percentage\" varchar(100)\n \"catches_punches\" varchar(100)\n \"shots_on_goal_total\" varchar(100)\n \"shots_shootout_total\" varchar(100)\n \"shots_shootout_allowed\" varchar(100)\n 
\"shots_blocked\" varchar(100)\n \"shutouts\" varchar(100)\n}\n\nTable \"soccer_event_states\" {\n \"id\" int4 [not null, increment]\n \"event_id\" int4 [not null]\n \"current_state\" int4\n \"sequence_number\" int4\n \"period_value\" varchar(100)\n \"period_time_elapsed\" varchar(100)\n \"period_time_remaining\" varchar(100)\n \"minutes_elapsed\" varchar(100)\n \"period_minute_elapsed\" varchar(100)\n \"context\" varchar(40)\n}\n\nTable \"soccer_foul_stats\" {\n \"id\" int4 [not null, increment]\n \"fouls_suffered\" varchar(100)\n \"fouls_commited\" varchar(100)\n \"cautions_total\" varchar(100)\n \"cautions_pending\" varchar(100)\n \"caution_points_total\" varchar(100)\n \"caution_points_pending\" varchar(100)\n \"ejections_total\" varchar(100)\n}\n\nTable \"soccer_offensive_stats\" {\n \"id\" int4 [not null, increment]\n \"goals_game_winning\" varchar(100)\n \"goals_game_tying\" varchar(100)\n \"goals_overtime\" varchar(100)\n \"goals_shootout\" varchar(100)\n \"goals_total\" varchar(100)\n \"assists_game_winning\" varchar(100)\n \"assists_game_tying\" varchar(100)\n \"assists_overtime\" varchar(100)\n \"assists_total\" varchar(100)\n \"points\" varchar(100)\n \"shots_total\" varchar(100)\n \"shots_on_goal_total\" varchar(100)\n \"shots_hit_frame\" varchar(100)\n \"shots_penalty_shot_taken\" varchar(100)\n \"shots_penalty_shot_scored\" varchar(100)\n \"shots_penalty_shot_missed\" varchar(40)\n \"shots_penalty_shot_percentage\" varchar(40)\n \"shots_shootout_taken\" varchar(40)\n \"shots_shootout_scored\" varchar(40)\n \"shots_shootout_missed\" varchar(40)\n \"shots_shootout_percentage\" varchar(40)\n \"giveaways\" varchar(40)\n \"offsides\" varchar(40)\n \"corner_kicks\" varchar(40)\n \"hat_tricks\" varchar(40)\n}\n\nTable \"standing_subgroups\" {\n \"id\" int4 [not null, increment]\n \"standing_id\" int4 [not null]\n \"affiliation_id\" int4 [not null]\n}\n\nTable \"standings\" {\n \"id\" int4 [not null, increment]\n \"affiliation_id\" int4 [not null]\n \"standing_type\" varchar(100)\n \"sub_season_id\" int4 [not null]\n \"last_updated\" varchar(100)\n \"duration_scope\" varchar(100)\n \"competition_scope\" varchar(100)\n \"competition_scope_id\" varchar(100)\n \"alignment_scope\" varchar(100)\n \"site_scope\" varchar(100)\n \"scoping_label\" varchar(100)\n \"publisher_id\" int4 [not null]\n \"source\" varchar(100)\n}\n\nTable \"stats\" {\n \"id\" int4 [not null, increment]\n \"stat_repository_type\" varchar(100)\n \"stat_repository_id\" int4 [not null]\n \"stat_holder_type\" varchar(100)\n \"stat_holder_id\" int4\n \"stat_coverage_type\" varchar(100)\n \"stat_coverage_id\" int4\n \"context\" varchar(40) [not null]\n}\n\nTable \"sub_periods\" {\n \"id\" int4 [not null, increment]\n \"period_id\" int4 [not null]\n \"sub_period_value\" varchar(100)\n \"score\" varchar(100)\n}\n\nTable \"sub_seasons\" {\n \"id\" int4 [not null, increment]\n \"sub_season_key\" varchar(100) [not null]\n \"season_id\" int4 [not null]\n \"sub_season_type\" varchar(100) [not null]\n \"start_date_time\" timestamp\n \"end_date_time\" timestamp\n}\n\nTable \"team_american_football_stats\" {\n \"id\" int4 [not null, increment]\n \"yards_per_attempt\" varchar(100)\n \"average_starting_position\" varchar(100)\n \"timeouts\" varchar(100)\n \"time_of_possession\" varchar(100)\n \"turnover_ratio\" varchar(100)\n}\n\nTable \"team_phases\" {\n \"id\" int4 [not null, increment]\n \"team_id\" int4 [not null]\n \"start_season_id\" int4\n \"end_season_id\" int4\n \"affiliation_id\" int4 [not null]\n \"start_date_time\" 
varchar(100)\n \"end_date_time\" varchar(100)\n \"phase_status\" varchar(40)\n \"role_id\" int4\n}\n\nTable \"teams\" {\n \"id\" int4 [not null, increment]\n \"team_key\" varchar(100) [not null]\n \"publisher_id\" int4 [not null]\n \"home_site_id\" int4\n}\n\nTable \"teams_documents\" {\n \"team_id\" int4 [not null]\n \"document_id\" int4 [not null]\n}\n\nTable \"teams_media\" {\n \"team_id\" int4 [not null]\n \"media_id\" int4 [not null]\n}\n\nTable \"tennis_action_points\" {\n \"id\" int4 [not null, increment]\n \"sub_period_id\" varchar(100)\n \"sequence_number\" varchar(100)\n \"win_type\" varchar(100)\n}\n\nTable \"tennis_action_volleys\" {\n \"id\" int4 [not null, increment]\n \"sequence_number\" varchar(100)\n \"tennis_action_points_id\" int4\n \"landing_location\" varchar(100)\n \"swing_type\" varchar(100)\n \"result\" varchar(100)\n \"spin_type\" varchar(100)\n \"trajectory_details\" varchar(100)\n}\n\nTable \"tennis_event_states\" {\n \"id\" int4 [not null, increment]\n \"event_id\" int4 [not null]\n \"current_state\" int4\n \"sequence_number\" int4\n \"tennis_set\" varchar(100)\n \"game\" varchar(100)\n \"server_person_id\" int4\n \"server_score\" varchar(100)\n \"receiver_person_id\" int4\n \"receiver_score\" varchar(100)\n \"service_number\" varchar(100)\n \"context\" varchar(40)\n}\n\nTable \"tennis_return_stats\" {\n \"id\" int4 [not null, increment]\n \"returns_played\" varchar(100)\n \"matches_played\" varchar(100)\n \"first_service_return_points_won\" varchar(100)\n \"first_service_return_points_won_pct\" varchar(100)\n \"second_service_return_points_won\" varchar(100)\n \"second_service_return_points_won_pct\" varchar(100)\n \"return_games_played\" varchar(100)\n \"return_games_won\" varchar(100)\n \"return_games_won_pct\" varchar(100)\n \"break_points_played\" varchar(100)\n \"break_points_converted\" varchar(100)\n \"break_points_converted_pct\" varchar(100)\n}\n\nTable \"tennis_service_stats\" {\n \"id\" int4 [not null, increment]\n \"services_played\" varchar(100)\n \"matches_played\" varchar(100)\n \"aces\" varchar(100)\n \"first_services_good\" varchar(100)\n \"first_services_good_pct\" varchar(100)\n \"first_service_points_won\" varchar(100)\n \"first_service_points_won_pct\" varchar(100)\n \"second_service_points_won\" varchar(100)\n \"second_service_points_won_pct\" varchar(100)\n \"service_games_played\" varchar(100)\n \"service_games_won\" varchar(100)\n \"service_games_won_pct\" varchar(100)\n \"break_points_played\" varchar(100)\n \"break_points_saved\" varchar(100)\n \"break_points_saved_pct\" varchar(100)\n}\n\nTable \"wagering_moneylines\" {\n \"id\" int4 [not null, increment]\n \"bookmaker_id\" int4 [not null]\n \"event_id\" int4 [not null]\n \"date_time\" timestamp\n \"team_id\" int4 [not null]\n \"person_id\" int4\n \"rotation_key\" varchar(100)\n \"comment\" varchar(100)\n \"vigorish\" varchar(100)\n \"line\" varchar(100)\n \"line_opening\" varchar(100)\n \"prediction\" varchar(100)\n}\n\nTable \"wagering_odds_lines\" {\n \"id\" int4 [not null, increment]\n \"bookmaker_id\" int4 [not null]\n \"event_id\" int4 [not null]\n \"date_time\" timestamp\n \"team_id\" int4 [not null]\n \"person_id\" int4\n \"rotation_key\" varchar(100)\n \"comment\" varchar(100)\n \"numerator\" varchar(100)\n \"denominator\" varchar(100)\n \"prediction\" varchar(100)\n \"payout_calculation\" varchar(100)\n \"payout_amount\" varchar(100)\n}\n\nTable \"wagering_runlines\" {\n \"id\" int4 [not null, increment]\n \"bookmaker_id\" int4 [not null]\n \"event_id\" int4 [not null]\n 
\"date_time\" timestamp\n \"team_id\" int4 [not null]\n \"person_id\" int4\n \"rotation_key\" varchar(100)\n \"comment\" varchar(100)\n \"vigorish\" varchar(100)\n \"line\" varchar(100)\n \"line_opening\" varchar(100)\n \"line_value\" varchar(100)\n \"prediction\" varchar(100)\n}\n\nTable \"wagering_straight_spread_lines\" {\n \"id\" int4 [not null, increment]\n \"bookmaker_id\" int4 [not null]\n \"event_id\" int4 [not null]\n \"date_time\" timestamp\n \"team_id\" int4 [not null]\n \"person_id\" int4\n \"rotation_key\" varchar(100)\n \"comment\" varchar(100)\n \"vigorish\" varchar(100)\n \"line_value\" varchar(100)\n \"line_value_opening\" varchar(100)\n \"prediction\" varchar(100)\n}\n\nTable \"wagering_total_score_lines\" {\n \"id\" int4 [not null, increment]\n \"bookmaker_id\" int4 [not null]\n \"event_id\" int4 [not null]\n \"date_time\" timestamp\n \"team_id\" int4 [not null]\n \"person_id\" int4\n \"rotation_key\" varchar(100)\n \"comment\" varchar(100)\n \"vigorish\" varchar(100)\n \"line_over\" varchar(100)\n \"line_under\" varchar(100)\n \"total\" varchar(100)\n \"total_opening\" varchar(100)\n \"prediction\" varchar(100)\n}\n\nTable \"weather_conditions\" {\n \"id\" int4 [not null, increment]\n \"event_id\" int4 [not null]\n \"temperature\" varchar(100)\n \"temperature_units\" varchar(40)\n \"humidity\" varchar(100)\n \"clouds\" varchar(100)\n \"wind_direction\" varchar(100)\n \"wind_velocity\" varchar(100)\n \"weather_code\" varchar(100)\n}\n",
"stateUrl": null,
"stateOriginalUrl": "https://github.com/yugabyte/yugabyte-db/blob/master/sample/sportsdb_tables.sql"
}
}
================================================
FILE: tasks/postgres/standard/sports/team_roster_management/verify.py
================================================
"""
Verification script for PostgreSQL Sports Task 2: Team Roster Management Operations
"""
import os
import sys
import psycopg2
from decimal import Decimal
def rows_match(actual_row, expected_row):
"""
Compare two rows with appropriate tolerance.
For Decimal types: allows 0.001 tolerance
For other types: requires exact match
"""
if len(actual_row) != len(expected_row):
return False
for actual, expected in zip(actual_row, expected_row):
if isinstance(actual, Decimal) and isinstance(expected, Decimal):
if abs(float(actual) - float(expected)) > 0.001:
return False
elif isinstance(actual, float) and isinstance(expected, float):
if abs(actual - expected) > 0.001:
return False
elif actual != expected:
return False
return True
def get_connection_params() -> dict:
"""Get database connection parameters."""
return {
"host": os.getenv("POSTGRES_HOST", "localhost"),
"port": int(os.getenv("POSTGRES_PORT", 5432)),
"database": os.getenv("POSTGRES_DATABASE", "sports"),
"user": os.getenv("POSTGRES_USERNAME", "postgres"),
"password": os.getenv("POSTGRES_PASSWORD", "postgres")
}
def verify_player_evaluation_table(conn) -> bool:
"""Verify the final state of player_evaluation table after all operations."""
with conn.cursor() as cur:
# Get actual results from the created table
cur.execute("""
SELECT person_id, batting_avg, home_runs, rbis, games_played, performance_score
FROM player_evaluation
ORDER BY person_id
""")
actual_results = cur.fetchall()
# Execute ground truth query that simulates all steps:
# 1. Initial insert (step 2)
# 2. Update based on injuries (step 4)
cur.execute("""
WITH initial_players AS (
SELECT
s.stat_holder_id AS person_id,
SUM(bos.hits) AS total_hits,
SUM(bos.at_bats) AS total_at_bats,
CASE
WHEN SUM(bos.at_bats) > 0
THEN 1.0 * SUM(bos.hits) / SUM(bos.at_bats)
ELSE 0
END AS batting_avg,
SUM(bos.home_runs) AS home_runs,
SUM(bos.rbi) AS rbis
FROM stats s
JOIN baseball_offensive_stats bos
ON s.stat_repository_id = bos.id
WHERE s.stat_holder_type = 'persons'
AND s.stat_repository_type = 'baseball_offensive_stats'
GROUP BY s.stat_holder_id
),
game_counts AS (
SELECT
person_id,
COUNT(DISTINCT event_id) AS games_played
FROM person_event_metadata
GROUP BY person_id
),
players_with_games AS (
SELECT
ip.person_id,
ip.batting_avg,
ip.home_runs,
ip.rbis,
COALESCE(gc.games_played, 0) AS games_played,
(ip.batting_avg * 1000)
+ (COALESCE(ip.home_runs, 0) * 5)
+ (COALESCE(ip.rbis, 0) * 2) AS initial_score
FROM initial_players ip
LEFT JOIN game_counts gc ON ip.person_id = gc.person_id
WHERE COALESCE(gc.games_played, 0) >= 10
),
injury_info AS (
SELECT
person_id,
COUNT(*) AS injury_count,
MAX(CASE WHEN end_date_time IS NULL THEN 1 ELSE 0 END) AS has_active_injury
FROM injury_phases
GROUP BY person_id
),
adjusted_scores AS (
SELECT
pwg.person_id,
pwg.batting_avg,
pwg.home_runs,
pwg.rbis,
pwg.games_played,
GREATEST(
CASE
WHEN COALESCE(ii.has_active_injury, 0) = 1 AND COALESCE(ii.injury_count, 0) > 2
THEN pwg.initial_score * 0.8 * 0.9
WHEN COALESCE(ii.has_active_injury, 0) = 1
THEN pwg.initial_score * 0.8
WHEN COALESCE(ii.injury_count, 0) > 2
THEN pwg.initial_score * 0.9
ELSE pwg.initial_score
END,
0
) AS performance_score
FROM players_with_games pwg
LEFT JOIN injury_info ii ON ii.person_id = pwg.person_id
)
SELECT
person_id,
batting_avg,
home_runs,
rbis,
games_played,
performance_score
FROM adjusted_scores
ORDER BY person_id;
""")
expected_results = cur.fetchall()
if len(actual_results) != len(expected_results):
print(f"❌ Expected {len(expected_results)} player evaluation records, got {len(actual_results)}")
return False
mismatches = 0
for i, (actual, expected) in enumerate(zip(actual_results, expected_results)):
if not rows_match(actual, expected):
if mismatches < 5: # Only show first 5 mismatches
print(f"❌ Row {i+1} mismatch: expected {expected}, got {actual}")
mismatches += 1
if mismatches > 0:
print(f"❌ Total mismatches in player_evaluation: {mismatches}")
return False
print(f"✅ Player evaluation table is correct ({len(actual_results)} records)")
return True
def verify_injury_status_table(conn) -> bool:
"""Verify the player_injury_status table and data."""
with conn.cursor() as cur:
# Get actual results
cur.execute("""
SELECT person_id, injury_count, last_injury_date, current_status
FROM player_injury_status
ORDER BY person_id
""")
actual_results = cur.fetchall()
# Execute ground truth query - get players from player_evaluation
cur.execute("""
WITH player_list AS (
SELECT DISTINCT person_id
FROM player_evaluation
),
injury_counts AS (
SELECT
person_id,
COUNT(*) as injury_count,
MAX(start_date_time::date) as last_injury_date,
MAX(CASE WHEN end_date_time IS NULL THEN 1 ELSE 0 END) as has_active_injury
FROM injury_phases
GROUP BY person_id
)
SELECT
pl.person_id,
COALESCE(ic.injury_count, 0) as injury_count,
ic.last_injury_date,
CASE
WHEN COALESCE(ic.has_active_injury, 0) = 1 THEN 'injured'
ELSE 'healthy'
END as current_status
FROM player_list pl
LEFT JOIN injury_counts ic ON pl.person_id = ic.person_id
ORDER BY pl.person_id
""")
expected_results = cur.fetchall()
if len(actual_results) != len(expected_results):
print(f"❌ Expected {len(expected_results)} injury status records, got {len(actual_results)}")
return False
mismatches = 0
for i, (actual, expected) in enumerate(zip(actual_results, expected_results)):
if not rows_match(actual, expected):
if mismatches < 5:
print(f"❌ Row {i+1} mismatch: expected {expected}, got {actual}")
mismatches += 1
if mismatches > 0:
print(f"❌ Total mismatches in player_injury_status: {mismatches}")
return False
print(f"✅ Player injury status table is correct ({len(actual_results)} records)")
return True
def verify_summary_table(conn) -> bool:
"""Verify the team_performance_summary table."""
with conn.cursor() as cur:
# Get actual results
cur.execute("""
SELECT metric_name, metric_value
FROM team_performance_summary
ORDER BY metric_name
""")
actual_results = cur.fetchall()
# Execute ground truth query
cur.execute("""
WITH player_data AS (
SELECT
COUNT(*) as total_players,
AVG(batting_avg) as avg_batting_average,
SUM(home_runs) as total_home_runs,
AVG(performance_score) as avg_performance_score
FROM player_evaluation
),
health_data AS (
SELECT
SUM(CASE WHEN current_status = 'injured' THEN 1 ELSE 0 END) as injured_count,
SUM(CASE WHEN current_status = 'healthy' THEN 1 ELSE 0 END) as healthy_count
FROM player_injury_status
WHERE person_id IN (SELECT person_id FROM player_evaluation)
)
SELECT metric_name, metric_value::DECIMAL
FROM (
SELECT 'avg_batting_average' as metric_name, avg_batting_average as metric_value FROM player_data
UNION ALL
SELECT 'avg_performance_score', avg_performance_score FROM player_data
UNION ALL
SELECT 'healthy_player_count', healthy_count FROM health_data
UNION ALL
SELECT 'injured_player_count', injured_count FROM health_data
UNION ALL
SELECT 'total_home_runs', total_home_runs FROM player_data
UNION ALL
SELECT 'total_players', total_players FROM player_data
) metrics
ORDER BY metric_name
""")
expected_results = cur.fetchall()
if len(actual_results) != len(expected_results):
print(f"❌ Expected {len(expected_results)} metrics, got {len(actual_results)}")
return False
mismatches = 0
for actual, expected in zip(actual_results, expected_results):
if not rows_match(actual, expected):
if mismatches < 5:
print(f"❌ Metric mismatch: expected {expected}, got {actual}")
mismatches += 1
if mismatches > 0:
print(f"❌ Total mismatches in summary table: {mismatches}")
return False
print(f"✅ Team performance summary table is correct ({len(actual_results)} metrics)")
return True
def main():
"""Main verification function."""
print("=" * 50)
print("Verifying Sports Task 2: Team Roster Management Operations")
print("=" * 50)
# Get connection parameters
conn_params = get_connection_params()
if not conn_params["database"]:
print("❌ No database specified")
sys.exit(1)
try:
# Connect to database
conn = psycopg2.connect(**conn_params)
# Verify all steps
success = (
verify_player_evaluation_table(conn) and
verify_injury_status_table(conn) and
verify_summary_table(conn)
)
conn.close()
if success:
print("\n🎉 Task verification: PASS")
sys.exit(0)
else:
print("\n❌ Task verification: FAIL")
sys.exit(1)
except psycopg2.Error as e:
print(f"❌ Database error: {e}")
sys.exit(1)
except Exception as e:
print(f"❌ Verification error: {e}")
sys.exit(1)
if __name__ == "__main__":
main()
================================================
FILE: tasks/postgres/standard/vectors/dba_vector_analysis/description.md
================================================
# PostgreSQL Vector Database Analysis
> Analyze and optimize a pgvector-powered database to understand storage patterns, performance characteristics, and data quality for embeddings in production workloads.
## What's this about?
You've got a PostgreSQL database running with the vector extension that stores embeddings for RAG (document similarity search, image recognition) or other ML workloads.
Your job is to dive deep into this vector database and figure out what's going on under the hood.
You need to understand:
- how vectors are stored
- how much space they're taking up
- whether indexes are working properly
- if there are any data quality issues lurking around
## What you need to investigate
First, get familiar with what you're working with:
- Check vector extension status: ensure it's installed properly, check the version, and identify any configuration issues
- Identify all vector columns across the entire database: report the columns, their data types, and their vector dimensions
- Map the vector landscape: understand relationships between vector tables and regular tables, including foreign keys and dependencies
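As a starting point, a sketch of catalog queries covering the first two bullets (the typmod-as-dimension reading is a pgvector convention worth double-checking):
```sql
-- Sketch only: confirm the extension and enumerate vector columns.
SELECT extname, extversion
FROM pg_extension
WHERE extname = 'vector';

SELECT n.nspname AS table_schema,
       c.relname AS table_name,
       a.attname AS column_name,
       format_type(a.atttypid, a.atttypmod) AS data_type,  -- e.g. vector(1536)
       a.atttypmod AS declared_dimensions                   -- pgvector keeps the dimension in the typmod
FROM pg_attribute a
JOIN pg_class c     ON c.oid = a.attrelid
JOIN pg_namespace n ON n.oid = c.relnamespace
JOIN pg_type t      ON t.oid = a.atttypid
WHERE t.typname = 'vector'
  AND a.attnum > 0
  AND NOT a.attisdropped
  AND c.relkind = 'r';
```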
Vectors can eat up a lot of storage, so let's see where the bytes are going:
- Calculate vector storage overhead: measure how much space vectors take compared to regular columns in the same tables
- Analyze table sizes: identify which vector tables are the biggest storage consumers, broken down by table
- Understand growth patterns: examine record counts and project future storage needs based on current data
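A rough sketch of a per-table breakdown, using `documents` / `embedding` purely as an illustrative table and column; treat the regular-data figure as an approximation, since total relation size also includes indexes and page overhead:
```sql
-- Sketch only: size breakdown for one table (swap in columns from your inventory).
SELECT pg_total_relation_size('documents') AS total_size_bytes,
       SUM(pg_column_size(embedding))      AS vector_data_bytes,
       pg_total_relation_size('documents')
         - SUM(pg_column_size(embedding))  AS regular_data_bytes,  -- approximation
       COUNT(*)                            AS row_count
FROM documents;
```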
Vectors without proper indexes are painfully slow, so investigate:
- Catalog vector indexes: find all HNSW and IVFFlat indexes, document their configurations and parameters
- Measure index effectiveness: determine if indexes are actually being used and helping query performance
- Identify optimization opportunities: spot missing indexes, suboptimal configurations, unused indexes
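A sketch of one way to catalogue the vector indexes from `pg_indexes`:
```sql
-- Sketch only: find HNSW / IVFFlat indexes with their definitions and sizes.
SELECT schemaname,
       tablename,
       indexname,
       indexdef,
       pg_relation_size(format('%I.%I', schemaname, indexname)::regclass) AS index_size_bytes
FROM pg_indexes
WHERE indexdef ILIKE '%USING hnsw%'
   OR indexdef ILIKE '%USING ivfflat%';
```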
Bad vector data makes everything worse:
- Hunt for data issues: locate NULL vectors, dimension mismatches, corrupted embeddings that could break queries
- Validate consistency: ensure vectors in each column have consistent dimensions across all rows
- Check for outliers: find vectors that might be skewing similarity calculations or causing performance issues
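A per-column quality sketch (the table and column names here are illustrative; `vector_dims()` is provided by pgvector):
```sql
-- Repeat for each vector table/column you discover
SELECT COUNT(*) AS total_rows,
       COUNT(*) FILTER (WHERE embedding IS NULL) AS null_vectors,
       COUNT(DISTINCT vector_dims(embedding)) AS distinct_dimensions
FROM documents;
```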
## Your deliverables
Create these analysis tables and populate them with your findings:
### `vector_analysis_columns`
Complete catalog of every vector column you find:
```sql
CREATE TABLE vector_analysis_columns (
schema VARCHAR(50),
table_name VARCHAR(100),
column_name VARCHAR(100),
dimensions INTEGER,
data_type VARCHAR(50),
has_constraints BOOLEAN,
rows BIGINT
);
```
### `vector_analysis_storage_consumption`
Show exactly where storage is being consumed:
```sql
CREATE TABLE vector_analysis_storage_consumption (
schema VARCHAR(50),
table_name VARCHAR(100),
total_size_bytes BIGINT,
vector_data_bytes BIGINT,
regular_data_bytes BIGINT,
vector_storage_pct NUMERIC(5,2),
row_count BIGINT
);
```
### `vector_analysis_indices`
Document all vector indexes and their characteristics:
```sql
CREATE TABLE vector_analysis_indices (
schema VARCHAR(50),
table_name VARCHAR(100),
column_name VARCHAR(100),
index_name VARCHAR(100),
index_type VARCHAR(50), -- 'hnsw', 'ivfflat', etc.
index_size_bytes BIGINT
);
```
Use PostgreSQL system catalogs, pgvector-specific functions, and storage analysis functions to gather comprehensive metrics about the vector database implementation.
================================================
FILE: tasks/postgres/standard/vectors/dba_vector_analysis/ground_truth.sql
================================================
-- Ground Truth Data for Vector Database Analysis Task
-- This defines the exact expected results that candidates should discover and report
/*
================================================================================
EXPECTED VECTOR DATABASE STRUCTURE (created by vectors_setup.py)
================================================================================
Tables with Vector Columns:
1. documents.embedding (vector(1536))
2. document_chunks.embedding (vector(1536))
3. user_queries.embedding (vector(1536))
Vector Indexes:
1. documents_embedding_idx (HNSW on documents.embedding)
2. chunks_embedding_idx (HNSW on document_chunks.embedding)
3. queries_embedding_idx (HNSW on user_queries.embedding)
Expected Data Counts:
- documents: 10 records
- document_chunks: ~40-70 records (3-7 chunks per document)
- user_queries: 10 records
- embedding_models: 5 records (metadata)
- knowledge_base: 5 records (metadata)
- search_cache: 5 records (metadata)
================================================================================
DEFINITIVE GROUND TRUTH VERIFICATION DATA
================================================================================
*/
BEGIN;
-- Create expected analysis result structure
CREATE TABLE IF NOT EXISTS expected_vector_column_inventory (
table_schema VARCHAR(50) DEFAULT 'public',
table_name VARCHAR(100),
column_name VARCHAR(100),
vector_dimensions INTEGER,
data_type VARCHAR(50) DEFAULT 'USER-DEFINED',
has_constraints BOOLEAN DEFAULT false,
min_estimated_rows BIGINT
);
-- Insert expected vector column inventory
INSERT INTO expected_vector_column_inventory (table_name, column_name, vector_dimensions, min_estimated_rows) VALUES
('documents', 'embedding', 1536, 10),
('document_chunks', 'embedding', 1536, 30),
('user_queries', 'embedding', 1536, 10);
-- Create expected storage analysis structure
CREATE TABLE IF NOT EXISTS expected_vector_storage_analysis (
table_name VARCHAR(100),
has_vector_data BOOLEAN,
min_row_count BIGINT,
vector_column_exists BOOLEAN,
should_have_storage_metrics BOOLEAN DEFAULT true
);
-- Insert expected storage analysis
INSERT INTO expected_vector_storage_analysis (table_name, has_vector_data, min_row_count, vector_column_exists) VALUES
('documents', true, 10, true),
('document_chunks', true, 30, true),
('user_queries', true, 10, true),
('embedding_models', false, 5, false),
('knowledge_base', false, 5, false),
('search_cache', false, 5, false);
-- Create expected index analysis structure
CREATE TABLE IF NOT EXISTS expected_vector_index_analysis (
index_name_pattern VARCHAR(100),
table_name VARCHAR(100),
column_name VARCHAR(100),
expected_index_type VARCHAR(50),
should_exist BOOLEAN DEFAULT true
);
-- Insert expected vector index analysis
INSERT INTO expected_vector_index_analysis (index_name_pattern, table_name, column_name, expected_index_type) VALUES
('%documents%embedding%', 'documents', 'embedding', 'hnsw'),
('%chunks%embedding%', 'document_chunks', 'embedding', 'hnsw'),
('%queries%embedding%', 'user_queries', 'embedding', 'hnsw');
-- Create storage analysis table
CREATE TABLE vector_storage_analysis (
table_name VARCHAR(100),
total_size_bytes BIGINT,
vector_data_bytes BIGINT,
regular_data_bytes BIGINT,
vector_storage_pct NUMERIC(5,2),
row_count BIGINT,
avg_vector_size_bytes INTEGER
);
-- Populate storage analysis with actual storage metrics
DO $$
DECLARE
rec RECORD;
total_size BIGINT;
row_cnt BIGINT;
vector_size INTEGER := 1536 * 4; -- 1536 dimensions * 4 bytes per float
BEGIN
FOR rec IN SELECT tablename FROM pg_tables WHERE tablename IN ('documents', 'document_chunks', 'user_queries') LOOP
EXECUTE format('SELECT COUNT(*) FROM %I', rec.tablename) INTO row_cnt;
SELECT pg_total_relation_size(format('public.%I', rec.tablename)) INTO total_size;
INSERT INTO vector_storage_analysis (
table_name, total_size_bytes, row_count, avg_vector_size_bytes,
vector_data_bytes, regular_data_bytes, vector_storage_pct
) VALUES (
rec.tablename,
total_size,
row_cnt,
vector_size,
row_cnt * vector_size,
GREATEST(total_size - (row_cnt * vector_size), 0),
ROUND((row_cnt * vector_size * 100.0) / NULLIF(total_size, 0), 2)
);
END LOOP;
END $$;
-- Create index analysis table
CREATE TABLE vector_index_analysis (
index_name VARCHAR(100),
table_name VARCHAR(100),
column_name VARCHAR(100),
index_type VARCHAR(50),
index_size_bytes BIGINT,
index_parameters TEXT,
is_valid BOOLEAN
);
-- Populate index analysis with actual vector indexes
INSERT INTO vector_index_analysis (index_name, table_name, column_name, index_type, index_size_bytes, is_valid)
SELECT
i.indexname as index_name,
i.tablename as table_name,
'embedding' as column_name, -- Known from our setup
CASE
WHEN i.indexdef ILIKE '%hnsw%' THEN 'hnsw'
WHEN i.indexdef ILIKE '%ivfflat%' THEN 'ivfflat'
ELSE 'unknown'
END as index_type,
pg_relation_size(format('public.%I', i.indexname)) as index_size_bytes,
true as is_valid
FROM pg_indexes i
WHERE (i.indexdef ILIKE '%vector%' OR i.indexdef ILIKE '%hnsw%' OR i.indexdef ILIKE '%ivfflat%')
AND i.tablename IN ('documents', 'document_chunks', 'user_queries')
ORDER BY i.tablename, i.indexname;
-- Create data quality analysis table
CREATE TABLE vector_data_quality (
table_name VARCHAR(100),
column_name VARCHAR(100),
quality_check_type VARCHAR(50),
total_records BIGINT,
issue_count BIGINT,
quality_status VARCHAR(20),
details TEXT
);
-- Populate data quality analysis with actual checks
DO $$
DECLARE
rec RECORD;
total_cnt BIGINT;
null_cnt BIGINT;
BEGIN
FOR rec IN SELECT tablename FROM pg_tables WHERE tablename IN ('documents', 'document_chunks', 'user_queries') LOOP
-- Count total records
EXECUTE format('SELECT COUNT(*) FROM %I', rec.tablename) INTO total_cnt;
-- Count NULL vectors
EXECUTE format('SELECT COUNT(*) FROM %I WHERE embedding IS NULL', rec.tablename) INTO null_cnt;
-- Insert NULL_CHECK result
INSERT INTO vector_data_quality (
table_name, column_name, quality_check_type,
total_records, issue_count, quality_status
) VALUES (
rec.tablename, 'embedding', 'NULL_CHECK',
total_cnt, null_cnt,
CASE WHEN null_cnt = 0 THEN 'GOOD' ELSE 'WARNING' END
);
-- Insert DIMENSION_CHECK result (all vectors in our setup are 1536-dimensional)
INSERT INTO vector_data_quality (
table_name, column_name, quality_check_type,
total_records, issue_count, quality_status
) VALUES (
rec.tablename, 'embedding', 'DIMENSION_CHECK',
total_cnt - null_cnt, 0, 'GOOD'
);
END LOOP;
END $$;
-- ============================================================================
-- GROUND TRUTH IMPLEMENTATION
-- ============================================================================
-- This is the correct analysis implementation that candidates should produce
-- Create vector_analysis_columns table and populate it
CREATE TABLE vector_analysis_columns (
schema VARCHAR(50),
table_name VARCHAR(100),
column_name VARCHAR(100),
dimensions INTEGER,
data_type VARCHAR(50),
has_constraints BOOLEAN,
rows BIGINT
);
-- Discover and insert vector columns
INSERT INTO vector_analysis_columns (schema, table_name, column_name, dimensions, data_type, has_constraints, rows)
SELECT
'public' as schema,
c.table_name,
c.column_name,
1536 as dimensions, -- pgvector embedding dimension
'USER-DEFINED' as data_type,
false as has_constraints,
-- Get actual row count using dynamic query
CASE c.table_name
WHEN 'documents' THEN (SELECT COUNT(*) FROM documents)
WHEN 'document_chunks' THEN (SELECT COUNT(*) FROM document_chunks)
WHEN 'user_queries' THEN (SELECT COUNT(*) FROM user_queries)
ELSE 0
END as rows
FROM information_schema.columns c
WHERE c.data_type = 'USER-DEFINED'
AND c.udt_name = 'vector'
ORDER BY c.table_name, c.column_name;
-- Create vector_analysis_storage_consumption table
CREATE TABLE vector_analysis_storage_consumption (
schema VARCHAR(50),
table_name VARCHAR(100),
total_size_bytes BIGINT,
vector_data_bytes BIGINT,
regular_data_bytes BIGINT,
vector_storage_pct NUMERIC(5,2),
row_count BIGINT
);
-- Populate storage analysis for vector tables
DO $$
DECLARE
rec RECORD;
total_size BIGINT;
row_cnt BIGINT;
vector_size INTEGER := 1536 * 4; -- 1536 dimensions * 4 bytes per float
BEGIN
FOR rec IN
SELECT DISTINCT c.table_name
FROM information_schema.columns c
WHERE c.data_type = 'USER-DEFINED'
AND c.udt_name = 'vector'
LOOP
-- Get actual row count
EXECUTE format('SELECT COUNT(*) FROM %I', rec.table_name) INTO row_cnt;
-- Get actual table size
SELECT pg_total_relation_size(format('public.%I', rec.table_name)) INTO total_size;
-- Insert analysis results
INSERT INTO vector_analysis_storage_consumption (
schema, table_name, total_size_bytes, vector_data_bytes,
regular_data_bytes, vector_storage_pct, row_count
) VALUES (
'public',
rec.table_name,
total_size,
row_cnt * vector_size,
GREATEST(total_size - (row_cnt * vector_size), 0),
ROUND((row_cnt * vector_size * 100.0) / NULLIF(total_size, 0), 2),
row_cnt
);
END LOOP;
END $$;
-- Create vector_analysis_indices table
CREATE TABLE vector_analysis_indices (
schema VARCHAR(50),
table_name VARCHAR(100),
column_name VARCHAR(100),
index_name VARCHAR(100),
index_type VARCHAR(50),
index_size_bytes BIGINT
);
-- Populate index analysis for vector indexes
INSERT INTO vector_analysis_indices (schema, table_name, column_name, index_name, index_type, index_size_bytes)
SELECT
i.schemaname as schema,
i.tablename as table_name,
'embedding' as column_name, -- known from our setup
i.indexname as index_name,
CASE
WHEN i.indexdef ILIKE '%hnsw%' THEN 'hnsw'
WHEN i.indexdef ILIKE '%ivfflat%' THEN 'ivfflat'
ELSE 'unknown'
END as index_type,
pg_relation_size(format('public.%I', i.indexname)) as index_size_bytes
FROM pg_indexes i
WHERE (i.indexdef ILIKE '%hnsw%' OR i.indexdef ILIKE '%ivfflat%')
AND i.tablename IN (
SELECT DISTINCT table_name
FROM information_schema.columns
WHERE data_type = 'USER-DEFINED' AND udt_name = 'vector'
)
ORDER BY i.tablename, i.indexname;
COMMIT;
-- ============================================================================
-- VERIFICATION HELPER QUERIES
-- ============================================================================
-- Query to check actual vector columns in the database
/*
SELECT
table_schema,
table_name,
column_name,
data_type,
udt_name
FROM information_schema.columns
WHERE data_type = 'USER-DEFINED'
AND udt_name = 'vector'
ORDER BY table_name, column_name;
*/
-- Query to check actual vector indexes
/*
SELECT
schemaname,
tablename,
indexname,
indexdef
FROM pg_indexes
WHERE indexdef ILIKE '%vector%'
OR indexdef ILIKE '%hnsw%'
OR indexdef ILIKE '%ivfflat%'
ORDER BY tablename, indexname;
*/
-- Query to check table row counts
/*
SELECT
'documents' as table_name, COUNT(*) as row_count FROM documents
UNION ALL
SELECT
'document_chunks' as table_name, COUNT(*) as row_count FROM document_chunks
UNION ALL
SELECT
'user_queries' as table_name, COUNT(*) as row_count FROM user_queries
ORDER BY table_name;
*/
-- Query to check pgvector extension
/*
SELECT extname, extversion
FROM pg_extension
WHERE extname = 'vector';
*/
================================================
FILE: tasks/postgres/standard/vectors/dba_vector_analysis/meta.json
================================================
{
"task_id": "dba_vector_analysis",
"task_name": "DBA Vector Analysis",
"category_id": "vectors",
"category_name": "Vectors",
"description": "Analyze pgvector database storage, identify vector columns, assess space utilization and performance for RAG applications.",
"author": "Fanshi Zhang",
"created_at": "2025-08-18",
"difficulty": "L3",
"tags": [
"performance optimization",
"audit and compliance",
"statistical aggregation"
],
"mcp": [
"postgres"
],
"meta_data": {
"stateType": "text",
"stateContent": "Table \"documents\" {\n \"id\" int4 [pk, not null, increment]\n \"title\" text [not null]\n \"content\" text [not null]\n \"source_url\" text\n \"document_type\" varchar(50) [default: 'article']\n \"created_at\" timestamp [default: `CURRENT_TIMESTAMP`]\n \"updated_at\" timestamp [default: `CURRENT_TIMESTAMP`]\n \"word_count\" int4\n \"embedding\" public.vector\n\n Indexes {\n created_at [type: btree, name: \"documents_created_idx\"]\n embedding [type: hnsw, name: \"documents_embedding_idx\"]\n title [type: btree, name: \"documents_title_idx\"]\n document_type [type: btree, name: \"documents_type_idx\"]\n }\n}\n\nTable \"document_chunks\" {\n \"id\" int4 [pk, not null, increment]\n \"document_id\" int4\n \"chunk_index\" int4 [not null]\n \"chunk_text\" text [not null]\n \"chunk_size\" int4\n \"overlap_size\" int4 [default: 0]\n \"created_at\" timestamp [default: `CURRENT_TIMESTAMP`]\n \"embedding\" public.vector\n\n Indexes {\n document_id [type: btree, name: \"chunks_doc_id_idx\"]\n embedding [type: hnsw, name: \"chunks_embedding_idx\"]\n chunk_index [type: btree, name: \"chunks_index_idx\"]\n }\n}\n\nTable \"user_queries\" {\n \"id\" int4 [pk, not null, increment]\n \"query_text\" text [not null]\n \"user_id\" varchar(100)\n \"session_id\" varchar(100)\n \"created_at\" timestamp [default: `CURRENT_TIMESTAMP`]\n \"response_time_ms\" int4\n \"embedding\" public.vector\n\n Indexes {\n created_at [type: btree, name: \"queries_created_idx\"]\n embedding [type: hnsw, name: \"queries_embedding_idx\"]\n user_id [type: btree, name: \"queries_user_idx\"]\n }\n}\n\nTable \"embedding_models\" {\n \"id\" int4 [pk, not null, increment]\n \"model_name\" varchar(100) [unique, not null]\n \"provider\" varchar(50) [not null]\n \"dimensions\" int4 [not null]\n \"max_tokens\" int4\n \"cost_per_token\" numeric(10,8)\n \"created_at\" timestamp [default: `CURRENT_TIMESTAMP`]\n \"is_active\" bool [default: true]\n}\n\nTable \"knowledge_base\" {\n \"id\" int4 [pk, not null, increment]\n \"kb_name\" varchar(100) [not null]\n \"description\" text\n \"domain\" varchar(50)\n \"language\" varchar(10) [default: 'en']\n \"total_documents\" int4 [default: 0]\n \"total_chunks\" int4 [default: 0]\n \"total_storage_mb\" numeric(10,2)\n \"created_at\" timestamp [default: `CURRENT_TIMESTAMP`]\n \"updated_at\" timestamp [default: `CURRENT_TIMESTAMP`]\n}\n\nTable \"search_cache\" {\n \"id\" int4 [pk, not null, increment]\n \"query_hash\" varchar(64) [not null]\n \"query_text\" text [not null]\n \"results_json\" jsonb\n \"result_count\" int4\n \"search_time_ms\" int4\n \"similarity_threshold\" numeric(4,3)\n \"created_at\" timestamp [default: `CURRENT_TIMESTAMP`]\n \"expires_at\" timestamp\n\n Indexes {\n expires_at [type: btree, name: \"cache_expires_idx\"]\n query_hash [type: btree, name: \"cache_hash_idx\"]\n }\n}\n\nRef \"document_chunks_document_id_fkey\":\"documents\".\"id\" < \"document_chunks\".\"document_id\" [delete: cascade]\n",
"stateUrl": null,
"stateOriginalUrl": null
}
}
================================================
FILE: tasks/postgres/standard/vectors/dba_vector_analysis/prepare_environment.py
================================================
"""
Environment preparation script for Vector Database DBA Analysis task.
This script imports and uses the shared vector database setup utilities.
"""
import sys
import logging
from pathlib import Path
# Add the vectors directory to import the shared utilities
sys.path.append(str(Path(__file__).resolve().parents[1]))
from vectors_setup import prepare_vector_environment
logger = logging.getLogger(__name__)
def prepare_environment():
"""Main function to prepare the vector database environment."""
prepare_vector_environment()
if __name__ == "__main__":
logging.basicConfig(level=logging.INFO)
prepare_environment()
================================================
FILE: tasks/postgres/standard/vectors/dba_vector_analysis/verify.py
================================================
"""
Verification script for Vector Database DBA Analysis task.
This script verifies that the candidate has properly analyzed the vector database
and stored their findings in appropriate result tables.
"""
import logging
import psycopg2
import os
import sys
from typing import Dict, Any
logger = logging.getLogger(__name__)
def get_connection_params():
"""Get database connection parameters from environment variables."""
return {
"host": os.getenv("POSTGRES_HOST", "localhost"),
"port": int(os.getenv("POSTGRES_PORT", 5432)),
"database": os.getenv("POSTGRES_DATABASE"),
"user": os.getenv("POSTGRES_USERNAME"),
"password": os.getenv("POSTGRES_PASSWORD"),
}
def verify_vector_analysis_columns(conn) -> Dict[str, Any]:
"""Verify the vector_analysis_columns table exists, has correct columns, and contains actual vector columns from the database."""
results = {'passed': False, 'issues': []}
expected_columns = [
'schema', 'table_name', 'column_name', 'dimensions', 'data_type', 'has_constraints', 'rows'
]
try:
with conn.cursor() as cur:
# Check if table exists
cur.execute("""
SELECT EXISTS (
SELECT FROM information_schema.tables
WHERE table_name = 'vector_analysis_columns'
);
""")
if not cur.fetchone()[0]:
results['issues'].append("vector_analysis_columns table not found")
return results
# Check columns
cur.execute("""
SELECT column_name FROM information_schema.columns
WHERE table_name = 'vector_analysis_columns'
ORDER BY column_name;
""")
actual_columns = {row[0] for row in cur.fetchall()}
missing = set(expected_columns) - actual_columns
extra = actual_columns - set(expected_columns)
if missing:
results['issues'].append(f"Missing columns: {missing}")
if extra:
results['issues'].append(f"Unexpected columns: {extra}")
# Check for data
cur.execute("SELECT COUNT(*) FROM vector_analysis_columns;")
count = cur.fetchone()[0]
if count == 0:
results['issues'].append("No rows found in vector_analysis_columns")
return results
# Get actual vector columns from the database
cur.execute("""
SELECT table_name, column_name
FROM information_schema.columns
WHERE data_type = 'USER-DEFINED'
AND udt_name = 'vector'
ORDER BY table_name, column_name;
""")
actual_vector_columns = set(cur.fetchall())
# Get what the agent found
cur.execute("""
SELECT table_name, column_name
FROM vector_analysis_columns
ORDER BY table_name, column_name;
""")
found_vector_columns = set(cur.fetchall())
# Check if agent found the actual vector columns
missing_vectors = actual_vector_columns - found_vector_columns
extra_vectors = found_vector_columns - actual_vector_columns
if missing_vectors:
results['issues'].append(f"Missing vector columns: {missing_vectors}")
if extra_vectors:
results['issues'].append(f"Reported vector columns that do not exist: {extra_vectors}")
if not missing and not extra and count > 0 and not missing_vectors and not extra_vectors:
results['passed'] = True
except psycopg2.Error as e:
results['issues'].append(f"Database error: {e}")
except Exception as e:
results['issues'].append(f"Verification error: {e}")
return results
def verify_vector_analysis_storage_consumption(conn) -> Dict[str, Any]:
"""Verify the vector_analysis_storage_consumption table exists, has correct columns, and analyzes actual vector tables."""
results = {'passed': False, 'issues': []}
expected_columns = [
'schema', 'table_name', 'total_size_bytes', 'vector_data_bytes', 'regular_data_bytes', 'vector_storage_pct', 'row_count'
]
try:
with conn.cursor() as cur:
cur.execute("""
SELECT EXISTS (
SELECT FROM information_schema.tables
WHERE table_name = 'vector_analysis_storage_consumption'
);
""")
if not cur.fetchone()[0]:
results['issues'].append("vector_analysis_storage_consumption table not found")
return results
cur.execute("""
SELECT column_name FROM information_schema.columns
WHERE table_name = 'vector_analysis_storage_consumption'
ORDER BY column_name;
""")
actual_columns = {row[0] for row in cur.fetchall()}
missing = set(expected_columns) - actual_columns
extra = actual_columns - set(expected_columns)
if missing:
results['issues'].append(f"Missing columns: {missing}")
if extra:
results['issues'].append(f"Unexpected columns: {extra}")
cur.execute("SELECT COUNT(*) FROM vector_analysis_storage_consumption;")
count = cur.fetchone()[0]
if count == 0:
results['issues'].append("No rows found in vector_analysis_storage_consumption")
return results
# Get actual tables with vector columns
cur.execute("""
SELECT DISTINCT table_name
FROM information_schema.columns
WHERE data_type = 'USER-DEFINED'
AND udt_name = 'vector'
ORDER BY table_name;
""")
actual_vector_tables = {row[0] for row in cur.fetchall()}
# Get what the agent analyzed
cur.execute("""
SELECT DISTINCT table_name
FROM vector_analysis_storage_consumption
ORDER BY table_name;
""")
analyzed_tables = {row[0] for row in cur.fetchall()}
# Check if agent analyzed the actual vector tables
missing_tables = actual_vector_tables - analyzed_tables
if missing_tables:
results['issues'].append(f"Agent missed analyzing vector tables: {missing_tables}")
# Check that analyzed tables actually have vector columns
extra_tables = analyzed_tables - actual_vector_tables
if extra_tables:
results['issues'].append(f"Agent analyzed non-vector tables: {extra_tables}")
if not missing and not extra and count > 0 and not missing_tables and not extra_tables:
results['passed'] = True
except psycopg2.Error as e:
results['issues'].append(f"Database error: {e}")
except Exception as e:
results['issues'].append(f"Verification error: {e}")
return results
def verify_vector_analysis_indices(conn) -> Dict[str, Any]:
"""Verify the vector_analysis_indices table exists, has correct columns, and identifies actual vector indexes."""
results = {'passed': False, 'issues': []}
expected_columns = [
'schema', 'table_name', 'column_name', 'index_name', 'index_type', 'index_size_bytes'
]
try:
with conn.cursor() as cur:
cur.execute("""
SELECT EXISTS (
SELECT FROM information_schema.tables
WHERE table_name = 'vector_analysis_indices'
);
""")
if not cur.fetchone()[0]:
results['issues'].append("vector_analysis_indices table not found")
return results
cur.execute("""
SELECT column_name FROM information_schema.columns
WHERE table_name = 'vector_analysis_indices'
ORDER BY column_name;
""")
actual_columns = {row[0] for row in cur.fetchall()}
missing = set(expected_columns) - actual_columns
extra = actual_columns - set(expected_columns)
if missing:
results['issues'].append(f"Missing columns: {missing}")
if extra:
results['issues'].append(f"Unexpected columns: {extra}")
cur.execute("SELECT COUNT(*) FROM vector_analysis_indices;")
count = cur.fetchone()[0]
if count == 0:
results['issues'].append("No rows found in vector_analysis_indices")
return results
# Get actual vector indexes from the database (exclude ground truth table indexes)
cur.execute("""
SELECT schemaname, tablename, indexname
FROM pg_indexes
WHERE (indexdef ILIKE '%hnsw%' OR indexdef ILIKE '%ivfflat%')
AND tablename NOT LIKE '%analysis%'
ORDER BY tablename, indexname;
""")
actual_vector_indexes = set(cur.fetchall())
# Get what the agent found
cur.execute("""
SELECT schema, table_name, index_name
FROM vector_analysis_indices
ORDER BY table_name, index_name;
""")
found_indexes = set(cur.fetchall())
# Check if agent found the actual vector indexes
missing_indexes = actual_vector_indexes - found_indexes
if missing_indexes:
results['issues'].append(f"Agent missed vector indexes: {missing_indexes}")
# Allow agent to find more indexes than just vector ones (they might include related indexes)
# but at least they should find the vector-specific ones
if not missing and not extra and count > 0 and not missing_indexes:
results['passed'] = True
except psycopg2.Error as e:
results['issues'].append(f"Database error: {e}")
except Exception as e:
results['issues'].append(f"Verification error: {e}")
return results
def verify_no_extra_analysis_tables(conn) -> Dict[str, Any]:
"""Check that only the required analysis tables exist (no legacy/extra analysis tables)."""
results = {'passed': True, 'issues': []} # Start with passed=True, more lenient
required = {
'vector_analysis_columns',
'vector_analysis_storage_consumption',
'vector_analysis_indices',
}
try:
with conn.cursor() as cur:
cur.execute("""
SELECT table_name FROM information_schema.tables
WHERE table_schema = 'public'
AND table_name LIKE 'vector_analysis_%';
""")
analysis_tables = {row[0] for row in cur.fetchall()}
# Only flag as issue if there are analysis tables that don't match our required set
# Exclude ground truth tables from this check
analysis_tables_filtered = {t for t in analysis_tables if not t.startswith('expected_') and not t.startswith('vector_analysis_results')}
extra = analysis_tables_filtered - required
if extra:
results['issues'].append(f"Found unexpected analysis tables: {extra}")
results['passed'] = False
except Exception as e:
results['issues'].append(f"Verification error: {e}")
results['passed'] = False
return results
def main():
"""Main verification function for vector analysis deliverables."""
conn_params = get_connection_params()
if not conn_params["database"]:
print("No database specified")
sys.exit(1)
try:
conn = psycopg2.connect(**conn_params)
checks = [
("vector_analysis_columns", verify_vector_analysis_columns),
("vector_analysis_storage_consumption", verify_vector_analysis_storage_consumption),
("vector_analysis_indices", verify_vector_analysis_indices),
("no_extra_analysis_tables", verify_no_extra_analysis_tables),
]
passed_checks = 0
all_issues = []
for i, (desc, check_func) in enumerate(checks, 1):
result = check_func(conn)
if result['passed']:
print(f" PASSED")
passed_checks += 1
else:
print(f" FAILED")
for issue in result['issues']:
print(f" - {issue}")
all_issues.extend(result['issues'])
print()
conn.close()
total_checks = len(checks)
print(f"Results: {passed_checks}/{total_checks} checks passed")
if passed_checks == total_checks:
sys.exit(0)
elif passed_checks >= total_checks * 0.75:
sys.exit(0)
else:
sys.exit(1)
except psycopg2.Error as e:
print(f"Database connection error: {e}")
sys.exit(1)
except Exception as e:
print(f"Verification error: {e}")
sys.exit(1)
if __name__ == "__main__":
main()
================================================
FILE: tasks/postgres/standard/vectors/vectors_setup.py
================================================
"""
Shared Vector Database Setup Utilities
This module provides utilities for setting up a complete PostgreSQL database
with pgvector extension and sample RAG-related tables with vector data.
Used by all vector database tasks.
"""
import os
import logging
import psycopg2
import json
import random
import numpy as np
from typing import List
logger = logging.getLogger(__name__)
def get_connection_params():
"""Get database connection parameters from environment variables."""
return {
'host': os.getenv('POSTGRES_HOST', 'localhost'),
'port': os.getenv('POSTGRES_PORT', '5432'),
'user': os.getenv('POSTGRES_USERNAME', 'postgres'),
'password': os.getenv('POSTGRES_PASSWORD', 'password'),
'database': os.getenv('POSTGRES_DATABASE', 'postgres')
}
def generate_mock_embedding(dimensions: int = 1536) -> List[float]:
"""Generate a mock embedding vector with specified dimensions."""
# Generate random values between -1 and 1, then normalize
vector = np.random.uniform(-1, 1, dimensions)
# Normalize to unit vector (common practice for embeddings)
norm = np.linalg.norm(vector)
if norm > 0:
vector = vector / norm
return vector.tolist()
def create_vector_extension():
"""Create the pgvector extension."""
conn_params = get_connection_params()
try:
conn = psycopg2.connect(**conn_params)
conn.autocommit = True
with conn.cursor() as cur:
logger.info("Creating pgvector extension...")
cur.execute("CREATE EXTENSION IF NOT EXISTS vector;")
logger.info("pgvector extension created successfully")
conn.close()
except psycopg2.Error as e:
logger.error(f"Failed to create pgvector extension: {e}")
raise
def create_vector_tables():
"""Create sample tables with vector columns for RAG applications."""
conn_params = get_connection_params()
try:
conn = psycopg2.connect(**conn_params)
conn.autocommit = True
with conn.cursor() as cur:
logger.info("Creating vector database tables...")
# Create documents table for document embeddings
cur.execute("""
CREATE TABLE IF NOT EXISTS documents (
id SERIAL PRIMARY KEY,
title TEXT NOT NULL,
content TEXT NOT NULL,
source_url TEXT,
document_type VARCHAR(50) DEFAULT 'article',
created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
updated_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
word_count INTEGER,
embedding vector(1536)
);
""")
# Create chunks table for document chunks (common in RAG)
cur.execute("""
CREATE TABLE IF NOT EXISTS document_chunks (
id SERIAL PRIMARY KEY,
document_id INTEGER REFERENCES documents(id) ON DELETE CASCADE,
chunk_index INTEGER NOT NULL,
chunk_text TEXT NOT NULL,
chunk_size INTEGER,
overlap_size INTEGER DEFAULT 0,
created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
embedding vector(1536)
);
""")
# Create queries table for storing user queries and their embeddings
cur.execute("""
CREATE TABLE IF NOT EXISTS user_queries (
id SERIAL PRIMARY KEY,
query_text TEXT NOT NULL,
user_id VARCHAR(100),
session_id VARCHAR(100),
created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
response_time_ms INTEGER,
embedding vector(1536)
);
""")
# Create embeddings metadata table
cur.execute("""
CREATE TABLE IF NOT EXISTS embedding_models (
id SERIAL PRIMARY KEY,
model_name VARCHAR(100) NOT NULL UNIQUE,
provider VARCHAR(50) NOT NULL,
dimensions INTEGER NOT NULL,
max_tokens INTEGER,
cost_per_token DECIMAL(10, 8),
created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
is_active BOOLEAN DEFAULT TRUE
);
""")
# Create knowledge base table
cur.execute("""
CREATE TABLE IF NOT EXISTS knowledge_base (
id SERIAL PRIMARY KEY,
kb_name VARCHAR(100) NOT NULL,
description TEXT,
domain VARCHAR(50),
language VARCHAR(10) DEFAULT 'en',
total_documents INTEGER DEFAULT 0,
total_chunks INTEGER DEFAULT 0,
total_storage_mb DECIMAL(10, 2),
created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
updated_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP
);
""")
# Create similarity search results cache
cur.execute("""
CREATE TABLE IF NOT EXISTS search_cache (
id SERIAL PRIMARY KEY,
query_hash VARCHAR(64) NOT NULL,
query_text TEXT NOT NULL,
results_json JSONB,
result_count INTEGER,
search_time_ms INTEGER,
similarity_threshold DECIMAL(4, 3),
created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
expires_at TIMESTAMP
);
""")
logger.info("Vector database tables created successfully")
conn.close()
except psycopg2.Error as e:
logger.error(f"Failed to create vector tables: {e}")
raise
def create_vector_indexes():
"""Create indexes for vector columns and other frequently queried fields."""
conn_params = get_connection_params()
try:
conn = psycopg2.connect(**conn_params)
conn.autocommit = True
with conn.cursor() as cur:
logger.info("Creating vector indexes...")
# Vector indexes using HNSW (Hierarchical Navigable Small World)
indexes = [
("documents_embedding_idx", "documents", "embedding", "hnsw"),
("chunks_embedding_idx", "document_chunks", "embedding", "hnsw"),
("queries_embedding_idx", "user_queries", "embedding", "hnsw"),
]
for idx_name, table_name, column_name, method in indexes:
try:
if method == "hnsw":
cur.execute(f"""
CREATE INDEX IF NOT EXISTS {idx_name}
ON {table_name} USING hnsw ({column_name} vector_cosine_ops);
""")
else:
cur.execute(f"""
CREATE INDEX IF NOT EXISTS {idx_name}
ON {table_name} USING ivfflat ({column_name} vector_cosine_ops) WITH (lists = 100);
""")
logger.info(f"Created index {idx_name} on {table_name}")
except psycopg2.Error as e:
logger.warning(f"Could not create {method} index {idx_name}: {e}")
# Try with IVFFlat as fallback
if method == "hnsw":
try:
cur.execute(f"""
CREATE INDEX IF NOT EXISTS {idx_name}_ivf
ON {table_name} USING ivfflat ({column_name} vector_cosine_ops) WITH (lists = 100);
""")
logger.info(f"Created fallback IVFFlat index {idx_name}_ivf on {table_name}")
except psycopg2.Error as e2:
logger.warning(f"Could not create fallback index: {e2}")
# Regular indexes for performance
regular_indexes = [
("documents_title_idx", "documents", "title"),
("documents_type_idx", "documents", "document_type"),
("documents_created_idx", "documents", "created_at"),
("chunks_doc_id_idx", "document_chunks", "document_id"),
("chunks_index_idx", "document_chunks", "chunk_index"),
("queries_user_idx", "user_queries", "user_id"),
("queries_created_idx", "user_queries", "created_at"),
("cache_hash_idx", "search_cache", "query_hash"),
("cache_expires_idx", "search_cache", "expires_at"),
]
for idx_name, table_name, column_name in regular_indexes:
try:
cur.execute(f"CREATE INDEX IF NOT EXISTS {idx_name} ON {table_name} ({column_name});")
logger.debug(f"Created regular index {idx_name}")
except psycopg2.Error as e:
logger.warning(f"Could not create regular index {idx_name}: {e}")
logger.info("Vector indexes created successfully")
conn.close()
except psycopg2.Error as e:
logger.error(f"Failed to create vector indexes: {e}")
raise
def insert_sample_data():
"""Insert sample data into vector tables."""
conn_params = get_connection_params()
try:
conn = psycopg2.connect(**conn_params)
conn.autocommit = True
with conn.cursor() as cur:
logger.info("Inserting sample data...")
# Insert embedding models
embedding_models = [
('text-embedding-3-small', 'OpenAI', 1536, 8192, 0.00000002, True),
('text-embedding-3-large', 'OpenAI', 3072, 8192, 0.00000013, True),
('text-embedding-ada-002', 'OpenAI', 1536, 8192, 0.00000010, False),
('all-MiniLM-L6-v2', 'Sentence-Transformers', 384, 512, 0.0, True),
('all-mpnet-base-v2', 'Sentence-Transformers', 768, 514, 0.0, True),
]
for model_data in embedding_models:
cur.execute("""
INSERT INTO embedding_models (model_name, provider, dimensions, max_tokens, cost_per_token, is_active)
VALUES (%s, %s, %s, %s, %s, %s)
ON CONFLICT (model_name) DO NOTHING;
""", model_data)
# Insert knowledge bases
knowledge_bases = [
('Technical Documentation', 'Software engineering and API documentation', 'technology'),
('Research Papers', 'Academic papers and research publications', 'research'),
('Customer Support', 'FAQ and troubleshooting guides', 'support'),
('Product Catalog', 'Product descriptions and specifications', 'commerce'),
('Legal Documents', 'Contracts, policies, and legal texts', 'legal'),
]
kb_ids = []
for kb_data in knowledge_bases:
cur.execute("""
INSERT INTO knowledge_base (kb_name, description, domain, total_documents, total_chunks, total_storage_mb)
VALUES (%s, %s, %s, %s, %s, %s)
RETURNING id;
""", kb_data + (random.randint(50, 500), random.randint(200, 2000), round(random.uniform(10.5, 250.8), 2)))
kb_ids.append(cur.fetchone()[0])
# Insert sample documents
sample_documents = [
("PostgreSQL Performance Tuning", "Comprehensive guide to optimizing PostgreSQL database performance including indexing strategies, query optimization, and configuration tuning.", "https://example.com/pg-performance", "technical_guide"),
("Vector Similarity Search", "Understanding vector embeddings and similarity search algorithms for AI applications and recommendation systems.", "https://example.com/vector-search", "technical_guide"),
("RAG Implementation Best Practices", "Best practices for implementing Retrieval-Augmented Generation systems using vector databases and large language models.", "https://example.com/rag-practices", "best_practices"),
("Database Security Guidelines", "Security considerations and implementation guidelines for PostgreSQL databases in production environments.", "https://example.com/db-security", "security_guide"),
("Machine Learning with SQL", "Integrating machine learning workflows with SQL databases and leveraging database extensions for AI applications.", "https://example.com/ml-sql", "tutorial"),
("API Documentation Standards", "Standards and best practices for creating comprehensive and user-friendly API documentation.", "https://example.com/api-docs", "documentation"),
("Microservices Architecture", "Design patterns and implementation strategies for microservices architecture in modern applications.", "https://example.com/microservices", "architecture_guide"),
("Data Pipeline Optimization", "Optimizing data processing pipelines for scalability, reliability, and performance in enterprise environments.", "https://example.com/data-pipelines", "optimization_guide"),
("Cloud Database Migration", "Step-by-step guide for migrating on-premises databases to cloud infrastructure with minimal downtime.", "https://example.com/cloud-migration", "migration_guide"),
("NoSQL vs SQL Comparison", "Detailed comparison of NoSQL and SQL databases, including use cases, performance characteristics, and selection criteria.", "https://example.com/nosql-sql", "comparison_guide"),
]
doc_ids = []
for title, content, url, doc_type in sample_documents:
embedding = generate_mock_embedding(1536)
word_count = len(content.split())
cur.execute("""
INSERT INTO documents (title, content, source_url, document_type, word_count, embedding)
VALUES (%s, %s, %s, %s, %s, %s)
RETURNING id;
""", (title, content, url, doc_type, word_count, embedding))
doc_ids.append(cur.fetchone()[0])
# Insert document chunks
chunk_count = 0
for doc_id in doc_ids:
# Generate 3-7 chunks per document
num_chunks = random.randint(3, 7)
for chunk_idx in range(num_chunks):
chunk_text = f"This is chunk {chunk_idx + 1} of document {doc_id}. " + \
"It contains relevant information that would be useful for similarity search and RAG applications. " + \
"The content includes technical details, examples, and best practices."
chunk_size = len(chunk_text)
overlap_size = random.randint(20, 50) if chunk_idx > 0 else 0
embedding = generate_mock_embedding(1536)
cur.execute("""
INSERT INTO document_chunks (document_id, chunk_index, chunk_text, chunk_size, overlap_size, embedding)
VALUES (%s, %s, %s, %s, %s, %s);
""", (doc_id, chunk_idx, chunk_text, chunk_size, overlap_size, embedding))
chunk_count += 1
# Insert sample user queries
sample_queries = [
("How to optimize PostgreSQL performance?", "user123", "session_abc1"),
("What are vector embeddings?", "user456", "session_def2"),
("Best practices for RAG implementation", "user789", "session_ghi3"),
("Database security checklist", "user123", "session_abc2"),
("Machine learning with databases", "user456", "session_def3"),
("API documentation examples", "user321", "session_jkl1"),
("Microservices design patterns", "user654", "session_mno2"),
("Data pipeline best practices", "user987", "session_pqr3"),
("Cloud migration strategies", "user111", "session_stu4"),
("NoSQL vs SQL databases", "user222", "session_vwx5"),
]
for query_text, user_id, session_id in sample_queries:
embedding = generate_mock_embedding(1536)
response_time = random.randint(50, 500)
cur.execute("""
INSERT INTO user_queries (query_text, user_id, session_id, response_time_ms, embedding)
VALUES (%s, %s, %s, %s, %s);
""", (query_text, user_id, session_id, response_time, embedding))
# Insert some search cache entries
for i in range(5):
query_hash = f"hash_{random.randint(100000, 999999)}"
query_text = f"Sample cached query {i + 1}"
results = [{"doc_id": random.randint(1, len(doc_ids)), "similarity": round(random.uniform(0.7, 0.95), 3)} for _ in range(3)]
result_count = len(results)
search_time = random.randint(10, 100)
threshold = round(random.uniform(0.6, 0.8), 3)
cur.execute("""
INSERT INTO search_cache (query_hash, query_text, results_json, result_count, search_time_ms, similarity_threshold)
VALUES (%s, %s, %s, %s, %s, %s);
""", (query_hash, query_text, json.dumps(results), result_count, search_time, threshold))
logger.info(f"Sample data inserted successfully:")
logger.info(f" {len(sample_documents)} documents")
logger.info(f" {chunk_count} document chunks")
logger.info(f" {len(sample_queries)} user queries")
logger.info(f" {len(embedding_models)} embedding models")
logger.info(f" {len(knowledge_bases)} knowledge bases")
conn.close()
except psycopg2.Error as e:
logger.error(f"Failed to insert sample data: {e}")
raise
def verify_vector_setup():
"""Verify that the vector database was set up correctly."""
conn_params = get_connection_params()
try:
conn = psycopg2.connect(**conn_params)
with conn.cursor() as cur:
logger.info("Verifying vector database setup...")
# Check extension
cur.execute("SELECT extname FROM pg_extension WHERE extname = 'vector';")
if cur.fetchone():
logger.info("pgvector extension is installed")
else:
logger.error("pgvector extension not found")
raise RuntimeError("pgvector extension not found")
# Check tables and record counts
tables_to_check = [
'documents', 'document_chunks', 'user_queries',
'embedding_models', 'knowledge_base', 'search_cache'
]
table_counts = {}
for table in tables_to_check:
cur.execute(f'SELECT COUNT(*) FROM {table}')
count = cur.fetchone()[0]
table_counts[table] = count
logger.info(f"Table {table}: {count} records")
# Check vector columns
cur.execute("""
SELECT table_name, column_name, data_type
FROM information_schema.columns
WHERE data_type = 'USER-DEFINED'
AND udt_name = 'vector'
ORDER BY table_name, column_name;
""")
vector_columns = cur.fetchall()
logger.info(f"Found {len(vector_columns)} vector columns:")
for table, column, dtype in vector_columns:
logger.info(f" {table}.{column} ({dtype})")
# Check indexes
cur.execute("""
SELECT schemaname, tablename, indexname, indexdef
FROM pg_indexes
WHERE indexdef LIKE '%vector%' OR indexdef LIKE '%hnsw%' OR indexdef LIKE '%ivfflat%'
ORDER BY tablename, indexname;
""")
vector_indexes = cur.fetchall()
logger.info(f"Found {len(vector_indexes)} vector indexes:")
for schema, table, index, definition in vector_indexes:
logger.info(f" {index} on {table}")
# Test a simple vector similarity query
mock_embedding = generate_mock_embedding(1536)
cur.execute("""
SELECT id, title, embedding <-> %s::vector as distance
FROM documents
ORDER BY embedding <-> %s::vector
LIMIT 3;
""", (mock_embedding, mock_embedding))
results = cur.fetchall()
logger.info(f"Vector similarity query returned {len(results)} results")
conn.close()
logger.info("Vector database verification completed successfully")
return table_counts, vector_columns, vector_indexes
except psycopg2.Error as e:
logger.error(f"Verification failed: {e}")
raise
def prepare_vector_environment():
"""Main function to prepare the vector database environment."""
logger.info("Preparing vector database environment...")
try:
# Create pgvector extension
create_vector_extension()
# Create vector tables
create_vector_tables()
# Insert sample data first
insert_sample_data()
# Create indexes after data insertion for better performance
create_vector_indexes()
# Verify the setup
table_counts, vector_columns, vector_indexes = verify_vector_setup()
logger.info("Vector database environment prepared successfully!")
logger.info(f"Total tables created: {len(table_counts)}")
logger.info(f"Total vector columns: {len(vector_columns)}")
logger.info(f"Total vector indexes: {len(vector_indexes)}")
return {
'table_counts': table_counts,
'vector_columns': vector_columns,
'vector_indexes': vector_indexes
}
except Exception as e:
logger.error(f"Failed to prepare vector environment: {e}")
raise
if __name__ == "__main__":
# Allow running this module directly for testing
logging.basicConfig(level=logging.INFO)
prepare_vector_environment()
================================================
FILE: tasks/utils/__init__.py
================================================
================================================
FILE: tasks/utils/notion_utils.py
================================================
import os
from notion_client import Client
import sys
from dotenv import load_dotenv
def get_notion_client():
# Load environment variables from the .mcp_env file in the project root
load_dotenv(dotenv_path=".mcp_env")
api_key = os.getenv("EVAL_NOTION_API_KEY")
if not api_key:
print(
"Error: EVAL_NOTION_API_KEY not found in environment variables.",
file=sys.stderr,
)
sys.exit(1)
return Client(auth=api_key)
def _find_object(notion: Client, title: str, object_type: str):
"""Generic helper to find a Notion page or database by title.
Args:
notion: Authenticated Notion Client.
title: Title (or partial title) to search for.
object_type: Either "page" or "database".
Returns:
The ID string if found, otherwise None.
"""
search_results = (
notion.search(
query=title, filter={"property": "object", "value": object_type}
).get("results")
or []
)
if not search_results:
return None
# Shortcut when there is only one match
if len(search_results) == 1:
return search_results[0]["id"]
# Attempt to find a case-insensitive match on the title field
for result in search_results:
if object_type == "page":
# Pages store their title inside the "properties.title.title" rich text list
title_rich_texts = (
result.get("properties", {}).get("title", {}).get("title", [])
)
else: # database
title_rich_texts = result.get("title", [])
for text_obj in title_rich_texts:
if title.lower() in text_obj.get("plain_text", "").lower():
return result["id"]
# Fallback: return the first result
return search_results[0]["id"]
def find_page(notion: Client, page_title: str):
"""Finds a page by title. Wrapper around _find_object with object_type='page'."""
return _find_object(notion, page_title, "page")
def get_page_by_id(notion: Client, page_id: str):
"""Gets a page by its ID. Returns the page object if found, None otherwise."""
try:
return notion.pages.retrieve(page_id=page_id)
except Exception:
return None
def find_page_by_id(notion: Client, page_id: str):
"""Finds a page by its ID and returns the ID if it exists, None otherwise."""
try:
notion.pages.retrieve(page_id=page_id)
return page_id
except Exception:
return None
def find_database_by_id(notion: Client, database_id: str):
"""Finds a database by its ID and returns the ID if it exists, None otherwise."""
try:
notion.databases.retrieve(database_id=database_id)
return database_id
except Exception:
return None
def find_page_or_database_by_id(notion: Client, object_id: str):
"""
Finds either a page or database by ID. Returns a tuple (object_id, object_type)
where object_type is either 'page' or 'database', or (None, None) if not found.
"""
# Try as page first
try:
notion.pages.retrieve(page_id=object_id)
return (object_id, "page")
except Exception:
pass
# Try as database
try:
notion.databases.retrieve(database_id=object_id)
return (object_id, "database")
except Exception:
pass
return (None, None)
def find_database(notion: Client, db_title: str):
"""Finds a database by title. Wrapper around _find_object with object_type='database'."""
return _find_object(notion, db_title, "database")
def find_database_in_block(notion: Client, block_id: str, db_title: str):
"""
Recursively find a database by title within a block.
"""
blocks = notion.blocks.children.list(block_id=block_id).get("results")
for block in blocks:
if (
block.get("type") == "child_database"
and block.get("child_database", {}).get("title") == db_title
):
return block["id"]
if block.get("has_children"):
db_id = find_database_in_block(notion, block["id"], db_title)
if db_id:
return db_id
return None
def get_all_blocks_recursively(notion: Client, block_id: str):
"""
Recursively fetches all blocks from a starting block ID and its children,
returning a single flat list of block objects.
"""
all_blocks = []
try:
direct_children = notion.blocks.children.list(block_id=block_id).get(
"results", []
)
except Exception:
return []
for block in direct_children:
all_blocks.append(block)
if block.get("has_children"):
all_blocks.extend(get_all_blocks_recursively(notion, block["id"]))
return all_blocks
def get_block_plain_text(block):
"""
Safely extract plain_text from a block (paragraph, heading, etc.).
"""
block_type = block.get("type")
if not block_type:
return ""
block_content = block.get(block_type)
if not block_content:
return ""
rich_text_list = block_content.get("rich_text", [])
plain_text = "".join([rt.get("plain_text", "") for rt in rich_text_list])
return plain_text
================================================
FILE: tasks/utils/postgres_utils.py
================================================
"""
PostgreSQL Data Loading Utilities for MCPMark Tasks
===================================================
Common utilities for loading data into PostgreSQL databases from CSV files
and setting up schemas in prepare_environment.py scripts.
"""
import csv
import os
import psycopg2
from pathlib import Path
from typing import Dict, List, Any, Optional
import logging
logger = logging.getLogger(__name__)
def get_connection_params() -> dict:
"""Get database connection parameters from environment variables."""
return {
"host": os.getenv("POSTGRES_HOST", "localhost"),
"port": int(os.getenv("POSTGRES_PORT", 5432)),
"database": os.getenv("POSTGRES_DATABASE"),
"user": os.getenv("POSTGRES_USERNAME"),
"password": os.getenv("POSTGRES_PASSWORD"),
}
def execute_schema_sql(conn, schema_sql: str):
"""Execute schema SQL with proper error handling."""
with conn.cursor() as cur:
cur.execute(schema_sql)
conn.commit()
logger.info("✅ Database schema created successfully")
def load_csv_to_table(
conn,
csv_file_path: Path,
table_name: str,
columns: Optional[List[str]] = None,
skip_header: bool = True
):
"""
Load CSV data into a PostgreSQL table.
Args:
conn: Database connection
csv_file_path: Path to CSV file
table_name: Target table name
columns: List of column names (if None, uses all columns)
skip_header: Whether to skip the first row
"""
if not csv_file_path.exists():
raise FileNotFoundError(f"CSV file not found: {csv_file_path}")
with conn.cursor() as cur:
with open(csv_file_path, 'r', encoding='utf-8') as f:
csv_reader = csv.reader(f)
# Skip header if needed
if skip_header:
next(csv_reader)
# Build COPY command
if columns:
copy_sql = f"COPY {table_name} ({', '.join(columns)}) FROM STDIN WITH CSV"
else:
copy_sql = f"COPY {table_name} FROM STDIN WITH CSV"
# Reset file pointer and copy data
f.seek(0)
if skip_header:
next(csv.reader(f)) # Skip header again
cur.copy_expert(copy_sql, f)
conn.commit()
logger.info(f"✅ Loaded data from {csv_file_path.name} into {table_name}")
def insert_data_from_dict(conn, table_name: str, data: List[Dict[str, Any]]):
"""
Insert data from a list of dictionaries into a table.
Args:
conn: Database connection
table_name: Target table name
data: List of dictionaries with column_name: value pairs
"""
if not data:
return
# Get column names from first record
columns = list(data[0].keys())
placeholders = ', '.join(['%s'] * len(columns))
columns_str = ', '.join(columns)
insert_sql = f"INSERT INTO {table_name} ({columns_str}) VALUES ({placeholders}) ON CONFLICT DO NOTHING"
with conn.cursor() as cur:
for row in data:
values = [row[col] for col in columns]
cur.execute(insert_sql, values)
conn.commit()
logger.info(f"✅ Inserted {len(data)} rows into {table_name}")
def create_table_with_data(
conn,
table_name: str,
schema_sql: str,
data: Optional[List[Dict[str, Any]]] = None,
data_from_csv: Optional[Path] = None
):
"""
Create a table and optionally load data.
Args:
conn: Database connection
table_name: Table name
schema_sql: CREATE TABLE SQL statement
data: Optional list of dictionaries to insert
data_from_csv: Optional CSV file to load
"""
with conn.cursor() as cur:
# Create table and commit so the DDL persists even when no data is loaded afterwards
cur.execute(schema_sql)
conn.commit()
logger.info(f"✅ Created table {table_name}")
# Load data if provided
if data:
insert_data_from_dict(conn, table_name, data)
elif data_from_csv:
load_csv_to_table(conn, data_from_csv, table_name)
def setup_database_with_config(setup_config: Dict[str, Any]):
"""
Set up database using a configuration dictionary.
Args:
setup_config: Dictionary with 'tables' key containing table configurations
Example config:
{
"tables": {
"artists": {
"schema": "CREATE TABLE artists (id SERIAL PRIMARY KEY, name VARCHAR(120))",
"data": [{"id": 1, "name": "Iron Maiden"}],
"data_from_csv": "data/artists.csv" # alternative to data
}
}
}
"""
conn_params = get_connection_params()
if not conn_params["database"]:
raise ValueError("❌ No database specified in POSTGRES_DATABASE environment variable")
try:
conn = psycopg2.connect(**conn_params)
for table_name, config in setup_config["tables"].items():
schema_sql = config["schema"]
data = config.get("data")
csv_file_path = None
# Handle CSV file path
if "data_from_csv" in config:
csv_file_path = Path(config["data_from_csv"])
if not csv_file_path.is_absolute():
# Assume relative to current working directory (task directory)
csv_file_path = Path.cwd() / csv_file_path
create_table_with_data(
conn,
table_name,
schema_sql,
data=data,
data_from_csv=csv_file_path
)
conn.close()
logger.info("🎉 Database setup completed successfully")
except psycopg2.Error as e:
logger.error(f"❌ Database error during setup: {e}")
raise
except Exception as e:
logger.error(f"❌ Setup error: {e}")
raise